[llvm] [AMDGPU] Added hot-block-rematerialize pass (PR #136631)

Adam Yang via llvm-commits llvm-commits at lists.llvm.org
Tue May 6 20:19:00 PDT 2025


https://github.com/adam-yang updated https://github.com/llvm/llvm-project/pull/136631

>From a9464fadec85393f0344cba9c9e94b125f170445 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Fri, 18 Apr 2025 11:14:14 -0700
Subject: [PATCH 01/11] Adding remat piece by piece

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |    4 +
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 1303 +++++++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     |  217 +++
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |   62 +
 .../AMDGPUOccupancyAndLatencyHelper.cpp       |   18 +
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |   53 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   11 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |    3 +
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |    4 +
 9 files changed, 1675 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4ff761ec19b3c..1ba8e3e2a54d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -530,6 +530,10 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &);
+FunctionPass *createAMDGPUHotBlockRematerializePass();
+extern char &AMDGPUHotBlockRematerializeID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
new file mode 100644
index 0000000000000..70b25beeb22b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -0,0 +1,1303 @@
+//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU hot-block rematerialization.
+///
+/// Finds "hot" blocks whose SGPR/VGPR pressure exceeds the occupancy target
+/// and rematerializes values that are live across them, either by moving a
+/// one-def-one-use instruction next to its single use or by cloning the def
+/// into the using blocks, to reduce pressure and regain occupancy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "AMDGPU.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "GCNRegPressure.h"
+
+#define DEBUG_TYPE "amdgpu-hot-block-remat"
+
+using namespace llvm;
+
+static cl::opt<unsigned> TargetOccupancy(
+    "amdgpu-remat-target-occupancy",
+    cl::desc("Force the target occupancy used by the hot-block remat pass"));
+
+namespace {
+
+using InstSet = DenseSet<MachineInstr *>;
+using BlockSet = DenseSet<MachineBasicBlock *>;
+
+struct RematNode {
+  enum class RematKind {
+    Candidate,    // Not classified yet.
+    OneDefOneUse, // Move the single def next to its single use.
+    Clone,        // Clone the def into each using block.
+  };
+  RematNode()
+      : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr),
+        Kind(RematKind::Candidate), Size(0) {}
+  RematNode(unsigned R, MachineInstr *MI, unsigned S)
+      : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr),
+        Kind(RematKind::Candidate), Size(S) {}
+  unsigned Reg;
+  MachineInstr *DefMI;
+  MachineBasicBlock *InsertBlock;
+  union {
+    MachineInstr *InsertPointMI;
+    unsigned UserCount;
+  };
+  RematKind Kind;
+  unsigned Size;
+};
+
+struct BlockLiveInfo {
+  MachineBasicBlock *BB;
+  unsigned MaxSReg;
+  unsigned MaxVReg;
+  // Input live set: registers that are live across the block boundary.
+  const GCNRPTracker::LiveRegSet InputLive;
+};
+
+struct RematStatus {
+  unsigned TargetOcc;
+  unsigned TargetVLimit;
+  unsigned TargetSLimit;
+  unsigned MaxVPressure;
+  unsigned MaxSPressure;
+  unsigned InputPhysicalVPressure;
+  unsigned InputPhysicalSPressure;
+  // Extra occupancy helps more than the latency cost of reaching it.
+  bool MemBound;
+  // abs(VTargetOcc-STargetOcc) > 1.
+  bool NotBalance;
+  DenseMap<MachineBasicBlock *, GCNRegPressure> MBBPressureMap;
+  DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBInputLiveMap;
+  DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBOutputLiveMap;
+  // MBBs that contain a memory write. When moving instructions across MBBs,
+  // skip memory instructions if the MBB has a memory write. To keep this
+  // fast, only mayStore and isBarrier are checked.
+  DenseSet<MachineBasicBlock *> MemWriteMBBSet;
+};
+
+class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
+
+public:
+  static char ID;
+
+  DenseSet<const MachineInstr *> TotalUniformInsts;
+  DenseSet<const MachineInstr *> SafeToRemoveInsts;
+  DenseSet<const MachineInstr *> DivergentInsts;
+  void removeInst(const MachineInstr *MI) {
+    TotalUniformInsts.erase(MI);
+    SafeToRemoveInsts.erase(MI);
+    DivergentInsts.erase(MI);
+  }
+
+  AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void applyRemat(MapVector<Register, RematNode> &RematMap,
+    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+    llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+    MachineFunction &MF);
+  void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
+    llvm::SlotIndexes *SlotIndexes,
+    const SIRegisterInfo *SIRI,
+    const SIInstrInfo *SIII);
+  void applyCloneRemat(RematNode &Node,
+    std::vector<BlockLiveInfo> &HotBlocks,
+    MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+    llvm::SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+    const SIInstrInfo *SIII, MachineFunction &MF);
+  bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
+    LiveIntervals *LIS, MachineDominatorTree *DT,
+    MachinePostDominatorTree *PDT, bool &IsNearTarget);
+
+  StringRef getPassName() const override { return "AMDGPU rematerialize"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfoWrapperPass>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+    AU.addRequired<SlotIndexesWrapperPass>();
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+void AMDGPUHotBlockRematerialize::applyRemat(
+    MapVector<Register, RematNode> &RematMap,
+    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+    llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
+  std::vector<RematNode> UpdateList;
+  for (auto &It : RematMap) {
+    UpdateList.emplace_back(It.second);
+  }
+  // Sort the update list by slot index to make sure defs are moved before
+  // uses. If a use were moved before its def, it might not be the first use
+  // anymore.
+  std::sort(UpdateList.begin(), UpdateList.end(),
+            [&SlotIndexes](RematNode &I, RematNode &J) {
+              SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI);
+              SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI);
+              return A < B;
+            });
+
+  for (RematNode &Node : UpdateList) {
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
+    } else if (Node.Kind == RematNode::RematKind::Clone) {
+      applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII,
+                      MF);
+    }
+  }
+}
+
+unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
+                            const GCNSubtarget *ST, unsigned &MaxVPressure,
+                            unsigned &MaxSPressure, RematStatus &Status) {
+  // Skip processing current block if It has only debug instructions
+  if (MBB.getFirstNonDebugInstr() == MBB.end())
+    return ST->getOccupancyWithNumVGPRs(0);
+  auto BBEnd = MBB.rbegin();
+  GCNUpwardRPTracker RPTracker(*LIS);
+  // BBEnd doesn't point at a boundary instruction.
+  // Skip debug instructions.
+  if (!llvm::getNonDebugMBBEnd(BBEnd, MBB))
+    return ST->getOccupancyWithNumVGPRs(0);
+
+  GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB];
+  RPTracker.reset(*BBEnd, &OutputLive, true);
+
+  for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) {
+    MachineInstr &MI = (*I++);
+    RPTracker.recede(MI);
+    if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH))
+      Status.MemWriteMBBSet.insert(&MBB);
+  }
+
+  GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
+  unsigned SPressure = RP.getMaxSGPR();
+  if (SPressure > MaxSPressure) {
+    MaxSPressure = SPressure;
+  }
+  if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) {
+    MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+  }
+  Status.MBBPressureMap[&MBB] = RP;
+  return RP.getOccupancy(*ST);
+}
+
+unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS,
+                           const MachineRegisterInfo &MRI,
+                           const GCNSubtarget *ST, unsigned &MaxVPressure,
+                           unsigned &MaxSPressure, RematStatus &Status) {
+  unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second;
+  // If there is only one block, the input/output virtual live sets are empty.
+  if (MF.size() > 1) {
+    // Build the input/output live register sets first.
+    auto *SlotIndexes = LIS->getSlotIndexes();
+    DenseMap<MachineBasicBlock *, SlotIndex> MBBInputSlotMap;
+    DenseMap<MachineBasicBlock *, SlotIndex> MBBOutputSlotMap;
+    for (MachineBasicBlock &MBB : MF) {
+      auto BBBegin = MBB.getFirstNonDebugInstr();
+      if (BBBegin != MBB.end()) {
+        auto SI = SlotIndexes->getInstructionIndex(*BBBegin);
+        MBBInputSlotMap[&MBB] = SI;
+      }
+
+      auto BBEnd = MBB.rbegin();
+
+      // BBEnd doesn't point at a boundary instruction.
+      // Skip debug instructions.
+      if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) {
+        auto SI = SlotIndexes->getInstructionIndex(*BBEnd);
+        MBBOutputSlotMap[&MBB] = SI;
+      }
+    }
+
+    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+      auto Reg = Register::index2VirtReg(I);
+      if (!LIS->hasInterval(Reg))
+        continue;
+
+      const auto &LI = LIS->getInterval(Reg);
+
+      // Skip local live intervals to make the input/output live computation
+      // faster.
+      if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+        continue;
+
+      for (auto InputIt : MBBInputSlotMap) {
+        MachineBasicBlock *MBB = InputIt.first;
+        auto SI = InputIt.second;
+
+        auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+        if (LiveMask.any())
+          Status.MBBInputLiveMap[MBB][Reg] |= LiveMask;
+      }
+
+      for (auto OutputIt : MBBOutputSlotMap) {
+        MachineBasicBlock *MBB = OutputIt.first;
+        auto SI = OutputIt.second;
+
+        auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+        if (LiveMask.any())
+          Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask;
+      }
+    }
+  }
+
+  LLVM_DEBUG(
+      const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+      dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) {
+        unsigned Idx = It.first->getNumber();
+        auto LiveReg = It.second;
+        dbgs() << "MBB" << Idx << ":";
+        llvm::dumpLiveSet(LiveReg, SIRI);
+      } dbgs() << "input live";
+      for (auto &It : Status.MBBInputLiveMap) {
+        unsigned Idx = It.first->getNumber();
+        auto LiveReg = It.second;
+        dbgs() << "MBB" << Idx << ":";
+        llvm::dumpLiveSet(LiveReg, SIRI);
+      });
+
+  for (auto It = MF.begin(); It != MF.end(); ++It) {
+    MachineBasicBlock &MBB = *It;
+    unsigned Occ =
+        collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status);
+    if (TgtOcc > Occ)
+      TgtOcc = Occ;
+  }
+  return TgtOcc;
+}
+
+RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
+                           LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+                           const GCNSubtarget *ST) {
+  unsigned MaxSPressure = 0;
+  unsigned MaxVPressure = 0;
+  RematStatus Status;
+  unsigned TgtOcc =
+      collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status);
+  const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+  if (TgtOcc >= MaxOcc) {
+    Status.TargetOcc = TgtOcc;
+    Status.TargetVLimit = 0;
+    Status.TargetSLimit = 0;
+    Status.MaxVPressure = 0;
+    Status.MaxSPressure = 0;
+    Status.InputPhysicalVPressure = 0;
+    Status.InputPhysicalSPressure = 0;
+    Status.MemBound = false;
+    Status.NotBalance = false;
+    return Status;
+  }
+
+  MaxSPressure += RegForVCC;
+  MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF));
+  unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure);
+  unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure);
+
+  llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI);
+  bool MemBound =
+      TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc);
+
+  bool NotBalance = false;
+
+  const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU();
+  // Currently, only an SGPR-bound function can be fixed with remat.
+  if (STgtOcc < VTgtOcc) {
+    unsigned BigOcc = std::max(STgtOcc, VTgtOcc);
+    // Raise TgtOcc to the bigger occupancy when the SGPR and VGPR targets are
+    // not balanced.
+    if (BigOcc > TgtOcc) {
+      TgtOcc = BigOcc;
+      NotBalance = true;
+      if (TgtOcc >= MaxOccupancy)
+        TgtOcc = MaxOccupancy - 1;
+    }
+  }
+
+  // Collect input physical pressure.
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+  unsigned VInputPressure = 0;
+  uint64_t SInputMask = 0;
+  for (const auto &Livein : MRI.liveins()) {
+    const Register Reg = Livein.first;
+    const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+    assert(Reg.isPhysical() && "input must be physical reg");
+    unsigned RegSize = RC->getLaneMask().getNumLanes();
+    if (SIRI->isVGPR(MRI, Reg)) {
+      VInputPressure += RegSize;
+    } else {
+      unsigned RegIndex = SIRI->getHWRegIndex(Reg);
+      uint64_t Mask = ((1ULL << RegSize) - 1) << RegIndex;
+      SInputMask |= Mask;
+    }
+  }
+  // SGPRs need to be aligned to 4 for the 4-dword/8-dword descriptors, which
+  // causes high pressure.
+  unsigned SInputPressure = 0;
+  uint64_t Mask = 0xf;
+  while (Mask != 0) {
+    if (Mask & SInputMask) {
+      SInputPressure += 4;
+    }
+    Mask = Mask << 4;
+  }
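+  // For example (illustrative values, not from a real shader): live-in SGPRs
+  // s[2:3] set bits 2-3 of SInputMask; the first group mask 0xf (s[0:3])
+  // overlaps it, so the loop above counts the whole aligned group of 4 SGPRs.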
+
+  // If balanced, aim for one more wave of occupancy.
+  TgtOcc = NotBalance ? TgtOcc : (TgtOcc + 1);
+
+  auto CC = MF.getFunction().getCallingConv();
+  bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS;
+  // For shader stages other than PS/CS, cap the target occupancy at 4.
+  if (!IsPsCs) {
+    TgtOcc = TgtOcc > 4 ? 4 : TgtOcc;
+  }
+  if (TargetOccupancy)
+    TgtOcc = TargetOccupancy;
+
+  unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true);
+  unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc);
+
+  Status.TargetOcc = TgtOcc;
+  Status.TargetVLimit = VLimit;
+  Status.TargetSLimit = SLimit;
+  Status.MaxVPressure = MaxVPressure;
+  Status.MaxSPressure = MaxSPressure;
+  Status.InputPhysicalVPressure = VInputPressure;
+  Status.InputPhysicalSPressure = SInputPressure;
+  Status.MemBound = MemBound;
+  Status.NotBalance = NotBalance;
+  return Status;
+}
+
+// For a case like
+//   %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0,
+//       implicit-def dead $scc
+//   S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc
+//   %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc
+// sinking the S_AND right before the S_CSELECT would overwrite SCC.
+// To avoid this, skip the case where DefMI has an implicit def that UseMI
+// reads.
+bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
+  if (DefMI->getDesc().NumImplicitDefs == 0)
+    return false;
+
+  auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo();
+  for (MachineOperand &Def : DefMI->implicit_operands()) {
+    if (!Def.isReg())
+      continue;
+    if (Def.isUse())
+      continue;
+    Register Reg = Def.getReg();
+    if (UseMI->readsRegister(Reg, TRI))
+      return true;
+  }
+  return false;
+}
+
+// SGPRs have alignment requirements, so an exact register count cannot be
+// computed.
+const unsigned NearTargetRegLimit = 10;
+bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST,
+                   MachineFunction &MF) {
+  unsigned MaxSGPR = ST->getAddressableNumSGPRs();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
+  if (ScratchRSrcReg)
+    MaxSGPR -= 4;
+
+  const unsigned AlignmentDelta = 3;
+  MaxSGPR -= AlignmentDelta;
+
+  return MaxSPressure > MaxSGPR;
+}
+
+// Skip live regs that have already been rematerialized into another block.
+void updateLiveInfo(MapVector<Register, RematNode> &RematMap,
+                    GCNRPTracker::LiveRegSet &LiveSet,
+                    const GCNRPTracker::LiveRegSet &InputLive,
+                    MachineBasicBlock *CurBB,
+                    DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+  for (auto &It : RematMap) {
+    unsigned Reg = It.first;
+    // Skip reg not in live set.
+    if (!LiveSet.count(Reg))
+      continue;
+    // Skip regs already in the input set.
+    // The input set is handled in getReducedSize.
+    if (InputLive.count(Reg))
+      continue;
+
+    auto &Node = It.second;
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      MachineBasicBlock *InsertBB = Node.InsertBlock;
+      // If CurBB comes after InsertBB in reverse post order, the def is still
+      // before CurBB, so the reg is still live.
+      unsigned LiveBBIndex = RPOTIndexMap[CurBB];
+      unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+      if (LiveBBIndex > InsertBBIndex) {
+        continue;
+      }
+    }
+    // Already in the remat map; no need to check again, so remove it from the
+    // candidates.
+    LiveSet.erase(Reg);
+  }
+}
+
+int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI,
+              const SIRegisterInfo *SIRI, bool IsVGPR) {
+  int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+  for (MachineOperand &MO : DefMI->operands()) {
+    if (MO.isImm())
+      continue;
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef())
+      continue;
+    if (MO.isTied())
+      continue;
+
+    if (MO.getReg() == AMDGPU::EXEC)
+      continue;
+
+    // Don't move user of VCC.
+    if (MO.getReg() == AMDGPU::VCC) {
+      RematSize = 0;
+      break;
+    }
+    Register OpReg = MO.getReg();
+
+    // Don't move users of physical registers.
+    if (OpReg.isPhysical()) {
+      RematSize = 0;
+      break;
+    }
+
+    if (IsVGPR != SIRI->isVGPR(MRI, OpReg)) {
+      // Mixing VGPR and SGPR operands is not supported for remat yet.
+      // TODO: count the possible pressure change here.
+      RematSize = 0;
+      break;
+    }
+    bool IsSingleDef = MRI.hasOneDef(OpReg);
+    if (!IsSingleDef) {
+      IsSingleDef = llvm::isSub0Sub1SingleDef(OpReg, MRI);
+    }
+
+    if (IsSingleDef) {
+      // The reg might be shared with other candidates; shared regs are counted
+      // when computing the reduced size.
+      const TargetRegisterClass *OpRC = MRI.getRegClass(OpReg);
+      if (unsigned SubIdx = MO.getSubReg()) {
+        if (OpRC)
+          OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+      }
+      int InputSize = SIRI->getRegSizeInBits(*OpRC);
+      // If the input is not live in the hot spot, moving the def across the
+      // hot spot should need fewer registers than DefMI's result.
+      if (RematSize > InputSize) {
+        RematSize -= InputSize;
+        continue;
+      }
+    }
+
+    RematSize = 0;
+    break;
+  }
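+  // RematSize is the estimated saving in bits; callers convert it to 32-bit
+  // registers with a >> 5.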
+  return RematSize;
+}
+
+MachineBasicBlock *findNonLoopDominator(MachineBasicBlock *BB,
+                                        MachineDominatorTree *DT,
+                                        MachineLoopInfo *LI) {
+  while (LI->getLoopDepth(BB) > 0) {
+    MachineDomTreeNode *N = DT->getNode(BB);
+    if (N == nullptr)
+      return nullptr;
+    MachineDomTreeNode *IDom = N->getIDom();
+    if (IDom == nullptr)
+      return nullptr;
+
+    BB = IDom->getBlock();
+  }
+
+  return BB;
+}
+
+MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT,
+                                          BlockSet &Blocks) {
+  auto I = Blocks.begin(), E = Blocks.end();
+
+  MachineBasicBlock *DomB = *(I++);
+  while (I != E) {
+    MachineBasicBlock *B = *(I++);
+    DomB = DT->findNearestCommonDominator(DomB, B);
+    if (DomB == nullptr)
+      return nullptr;
+  }
+  // For a split block like:
+  //   bb.42:
+  //     %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec
+  //     %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicit-def $exec,
+  //         implicit-def $scc, implicit $exec
+  //
+  //   bb.68:
+  //   ; predecessors: %bb.42
+  //     successors: %bb.45(0x40000000), %bb.43(0x40000000)
+  //
+  //     SI_MASK_BRANCH %bb.43, implicit $exec
+  //     S_BRANCH %bb.45
+  //
+  // which was split from:
+  //   bb.42:
+  //     %129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec
+  //     %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicit-def $exec, ...
+  //     SI_MASK_BRANCH %bb.43, implicit $exec
+  //     S_BRANCH %bb.45
+  // the real common dominator is bb.42.
+  // TODO: use the _term version of the exec update instructions so this is
+  // not needed anymore.
+  if (DomB && DomB->pred_size() == 1 && !DomB->empty()) {
+    // Upstreaming note: This used to be SI_MASK_BRANCH
+    if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) {
+      MachineBasicBlock *Pred = *DomB->pred_begin();
+      if (Pred->succ_size() == 1 &&
+          (Pred->empty() || !Pred->back().isBranch())) {
+        DomB = Pred;
+      }
+    }
+  }
+
+  return DomB;
+}
+
+MachineBasicBlock *
+findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT,
+                MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+                const MachineRegisterInfo &MRI, bool MemBound) {
+
+  BlockSet BBSet;
+  for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+    BBSet.insert(UseMI.getParent());
+  }
+  if (BBSet.empty())
+    return nullptr;
+
+  MachineBasicBlock *BB = *BBSet.begin();
+  if (BBSet.size() > 1) {
+    MachineBasicBlock *BDom = nearestCommonDominator(DT, BBSet);
+    if (!BDom)
+      return nullptr;
+    BB = BDom;
+  }
+  // Try to find a dominator that is not inside a loop.
+  if (!MemBound) {
+    BB = findNonLoopDominator(BB, DT, MLI);
+  }
+  if (!BB)
+    return nullptr;
+
+  // If BB is already a hot block, moving to BB will not help;
+  // hotBlockRemat will reject it when processing BB.
+
+  // BB must be reachable from DefMI.
+  if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB))
+    return nullptr;
+
+  return BB;
+}
+
+// Note: this may be expensive if called frequently.
+bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+  for (auto &Def : DefMI->defs()) {
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) {
+      if (UseMI.isPHI())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+  // Do not move PHI nodes
+  if (isUsedByPhi(DefMI, MRI))
+    return false;
+
+  unsigned OpNum = DefMI->getNumOperands();
+  // Only move a DefMI whose register operands all have unique defs.
+  for (unsigned I = 0; I < OpNum; I++) {
+    MachineOperand &Op = DefMI->getOperand(I);
+    if (!Op.isReg())
+      continue;
+    if (!MRI.getUniqueVRegDef(Op.getReg()) &&
+        !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void addOneDefOneUseCandidate(RematNode &Node,
+                              std::vector<RematNode> &RematList,
+                              MachineRegisterInfo &MRI, int &RematCnt,
+                              MachineDominatorTree *DT,
+                              MachinePostDominatorTree *PDT,
+                              MachineLoopInfo *MLI, bool IsVGPR,
+                              bool MemBound) {
+  unsigned Reg = Node.Reg;
+  MachineInstr *DefMI = Node.DefMI;
+
+  unsigned Size = Node.Size;
+  MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin();
+  MachineBasicBlock *InsertBB = UseMI->getParent();
+
+  // For a VGPR, always move next to the only user to avoid WQM or exec
+  // issues. But doing this causes problems when DefMI is in WQM and the user
+  // is not, so VGPR remat is disabled for now.
+  // TODO: make sure a single user does not need WQM.
+  if (!IsVGPR) {
+    if (MachineBasicBlock *NewInsertBB =
+            findInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, MemBound)) {
+      if (InsertBB != NewInsertBB) {
+        InsertBB = NewInsertBB;
+        // If a non-loop insert block was found, insert there.
+        if (DefMI->getParent() != InsertBB) {
+          if (!InsertBB->empty()) {
+            auto It = InsertBB->getFirstNonPHI();
+            It = skipDebugInstructionsForward(It, InsertBB->end());
+            if (It == InsertBB->end())
+              UseMI = nullptr;
+            else
+              UseMI = &*It;
+          }
+        }
+      }
+    }
+  }
+
+  if (IsVGPR) {
+    // Don't count regs whose def and use are in the same block for VALU.
+    if (UseMI->getParent() == DefMI->getParent())
+      return;
+  }
+
+  // Skip the case where DefMI has an implicit def that UseMI uses.
+  if (isImplicitDefUse(DefMI, UseMI)) {
+    return;
+  }
+
+  Node.InsertBlock = InsertBB;
+  Node.InsertPointMI = UseMI;
+  Node.Kind = RematNode::RematKind::OneDefOneUse;
+  RematList.emplace_back(Node);
+  RematCnt += Size;
+}
+
+void buildRematCandiates(std::vector<RematNode> &Candidates,
+                         GCNRPTracker::LiveRegSet &CandidateRegSet,
+                         DenseSet<unsigned> &PinnedRegSet,
+                         const MachineRegisterInfo &MRI,
+                         const SIInstrInfo *SIII, const SIRegisterInfo *SIRI,
+                         bool IsVGPR) {
+
+  for (auto LiveRegIt : CandidateRegSet) {
+    unsigned Reg = LiveRegIt.first;
+    // Skip unsafe reg.
+    if (PinnedRegSet.count(Reg))
+      continue;
+
+    if (SIRI->isVGPR(MRI, Reg) != IsVGPR)
+      continue;
+    bool IsSafeCandidate = true;
+    MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+    if (MI) {
+      if (IsVGPR) {
+        // Only remat VALU instructions for now.
+        if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY)
+          IsSafeCandidate = false;
+        if (MI->getOpcode() == AMDGPU::COPY) {
+          // Make sure the source has a unique def.
+          if (MI->getOperand(1).isReg() &&
+              nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg()))
+            IsSafeCandidate = false;
+        } else {
+          // Skip convergent VALU instructions.
+          if (MI->isConvergent())
+            IsSafeCandidate = false;
+        }
+      }
+      // Skip instructions with more than one def.
+      if (MI->getDesc().NumDefs > 1)
+        IsSafeCandidate = false;
+    } else {
+      IsSafeCandidate = false;
+    }
+
+    if (IsSafeCandidate) {
+      int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR);
+      if (Gain > 0) {
+        Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5));
+      } else {
+        IsSafeCandidate = false;
+      }
+    }
+    // Save unsafe reg.
+    if (!IsSafeCandidate)
+      PinnedRegSet.insert(Reg);
+  }
+
+  // Sort by gain.
+  std::sort(Candidates.begin(), Candidates.end(),
+            [](RematNode &I, RematNode &J) { return I.Size > J.Size; });
+}
+
+void addCloneCandidate(std::vector<RematNode *> &CloneList,
+                       std::vector<RematNode> &RematList,
+                       DenseSet<unsigned> &PinnedRegSet,
+                       MachineRegisterInfo &MRI, int &RematCnt) {
+  // Group users by block.
+  std::vector<BlockSet> UserSetList(CloneList.size());
+
+  for (size_t I = 0; I < CloneList.size(); I++) {
+    auto *Node = CloneList[I];
+    unsigned Reg = Node->Reg;
+    MachineInstr *DefMI = Node->DefMI;
+    // Group the users of this reg by block.
+    BlockSet &UserSet = UserSetList[I];
+
+    for (auto UseIt = MRI.use_instr_nodbg_begin(Reg);
+         UseIt != MRI.use_instr_nodbg_end();) {
+      MachineInstr &UseMI = *(UseIt++);
+      UserSet.insert(UseMI.getParent());
+    }
+
+    if (UserSet.size() == 1) {
+      // All users are in the same block as DefMI.
+      if (*UserSet.begin() == DefMI->getParent()) {
+        // Mark as not rematerializable for now.
+        // TODO: try to split if the reg is bigger than 4 and only used once
+        // per channel.
+        PinnedRegSet.insert(Reg);
+        continue;
+      }
+    }
+
+    int Size = Node->Size;
+    Size <<= 16;
+    // Pack the user-set size into the low 16 bits.
+    Size |= UserSet.size();
+    Node->UserCount = Size;
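+    // E.g. (illustrative values) Size = 2 with users in 3 blocks is stored as
+    // (2 << 16) | 3; the sort below only compares the low 16 bits.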
+  }
+
+  std::sort(CloneList.begin(), CloneList.end(),
+            // Sort by the number of using blocks.
+            [](const RematNode *A, const RematNode *B) {
+              static constexpr int Mask = 0xffff;
+              return (A->UserCount & Mask) < (B->UserCount & Mask);
+            });
+
+  for (RematNode *Node : CloneList) {
+    Node->Kind = RematNode::RematKind::Clone;
+    RematList.emplace_back(*Node);
+    RematCnt += Node->Size;
+  }
+}
+
+int filterRematCandiates(std::vector<RematNode> &Candidates,
+                         std::vector<RematNode> &RematList,
+                         DenseSet<unsigned> &PinnedRegSet,
+                         MachineDominatorTree *DT,
+                         MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+                         MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) {
+  int RematCnt = 0;
+  // Handle one-def-one-use candidates first.
+  for (auto &Node : Candidates) {
+    unsigned Reg = Node.Reg;
+    if (!MRI.hasOneNonDBGUse(Reg)) {
+      continue;
+    }
+    MachineInstr *DefMI = Node.DefMI;
+    if (!isSafeToMove(DefMI, MRI)) {
+      PinnedRegSet.insert(Reg);
+      continue;
+    }
+
+    addOneDefOneUseCandidate(Node, RematList, MRI, RematCnt, DT, PDT, MLI,
+                             IsVGPR, MemBound);
+  }
+
+  if (!IsVGPR) {
+    std::vector<RematNode *> CloneList;
+    // Then try the multi-use candidates.
+    for (auto &Node : Candidates) {
+      unsigned Reg = Node.Reg;
+      if (MRI.hasOneNonDBGUse(Reg)) {
+        continue;
+      }
+      MachineInstr *DefMI = Node.DefMI;
+      if (!isSafeToMove(DefMI, MRI)) {
+        PinnedRegSet.insert(Reg);
+        continue;
+      }
+
+      // Clone the def for each using block.
+      CloneList.emplace_back(&Node);
+    }
+
+    addCloneCandidate(CloneList, RematList, PinnedRegSet, MRI, RematCnt);
+  }
+
+  return RematCnt;
+}
+
+int getReducedSize(MapVector<Register, RematNode> &RematMap,
+                   GCNRPTracker::LiveRegSet &CandidateSet,
+                   InstSet &ReducedInsts, const MachineRegisterInfo &MRI,
+                   BlockLiveInfo &LiveInfo,
+                   DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+  int ReducedSize = 0;
+  for (auto &It : RematMap) {
+    Register Reg = It.first;
+
+    if (!CandidateSet.count(Reg))
+      continue;
+
+    bool IsReduced = false;
+    auto &Node = It.second;
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      MachineBasicBlock *InsertBB = Node.InsertBlock;
+      // If LiveInfo.BB comes before InsertBB in reverse post order, the def
+      // has been moved to after LiveInfo.BB, so the reg is not live anymore.
+      unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB];
+      unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+      if (LiveBBIndex < InsertBBIndex)
+        IsReduced = true;
+    } else {
+      // Clone.
+      IsReduced = true;
+      // If there is a use in LiveInfo.BB, the reg cannot be removed from the
+      // input live set.
+      for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+        if (UseMI.getParent() == LiveInfo.BB) {
+          IsReduced = false;
+          break;
+        }
+      }
+    }
+    if (IsReduced) {
+      ReducedSize += Node.Size;
+      ReducedInsts.insert(Node.DefMI);
+    }
+
+    // Already in the remat map; no need to check again, so remove from the
+    // candidates.
+    CandidateSet.erase(Reg);
+  }
+
+  return ReducedSize;
+}
+
+int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
+                         const MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI) {
+
+  // Find shared operand in ReducedInsts.
+  int SharedSize = 0;
+  DenseMap<unsigned, LaneBitmask> SharedRegMaskMap;
+  for (MachineInstr *DefMI : ReducedInsts) {
+    for (MachineOperand &MO : DefMI->operands()) {
+      if (MO.isImm())
+        continue;
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef())
+        continue;
+      if (MO.isTied())
+        continue;
+      Register Reg = MO.getReg();
+
+      if (Reg == AMDGPU::EXEC)
+        continue;
+      if (!Reg.isVirtual())
+        continue;
+
+      if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) {
+        // Mixing VGPR and SGPR operands is not supported for remat yet.
+        continue;
+      }
+
+      const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+      int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+      unsigned Mask;
+      if (unsigned SubIdx = MO.getSubReg()) {
+        OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+        int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+        Mask = (1 << SubMOSize) - 1;
+      } else {
+        Mask = (1 << MOSize) - 1;
+      }
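+      // Note: the mask built here only encodes the operand size in dwords,
+      // not the exact lanes, so the sharing check below is an approximation
+      // for sub-register uses.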
+      auto SharedRegIt = SharedRegMaskMap.find(Reg);
+      if (SharedRegIt == SharedRegMaskMap.end()) {
+        SharedRegMaskMap[Reg] = LaneBitmask(Mask);
+      } else {
+        unsigned PrevMask = SharedRegIt->second.getAsInteger();
+        if (unsigned SharedMask = (PrevMask & Mask)) {
+          // Something is shared.
+          for (int I = 0; I < MOSize; I++) {
+            if (SharedMask & (1 << I)) {
+              SharedSize += 1;
+            }
+          }
+        }
+        LaneBitmask MoMask = LaneBitmask(Mask | PrevMask);
+        SharedRegMaskMap[Reg] = MoMask;
+      }
+    }
+  }
+  return SharedSize;
+}
+
+void dumpRematMap(MapVector<Register, RematNode> &RematMap,
+                  const SIRegisterInfo *SIRI) {
+  dbgs() << "\n rematMap: \n";
+  for (auto It : RematMap) {
+    int Reg = It.first;
+    dbgs() << printReg(Reg, SIRI);
+    dbgs() << "\n";
+  }
+}
+int DebugBlockIndex = 42;
+void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet,
+                  MapVector<Register, RematNode> &VRematMap,
+                  MapVector<Register, RematNode> &SRematMap, int BlockIndex,
+                  const SIRegisterInfo *SIRI) {
+  if (DebugBlockIndex != BlockIndex)
+    return;
+  llvm::dumpLiveSet(LiveSet, SIRI);
+  dumpRematMap(VRematMap, SIRI);
+  dumpRematMap(SRematMap, SIRI);
+}
+
+void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
+                    const SIRegisterInfo *SIRI) {
+  if (DebugBlockIndex != BlockIndex)
+    return;
+  dbgs() << "\n Candidates: \n";
+  unsigned TotalSize = 0;
+  for (RematNode &Node : RematCandidates) {
+    dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size;
+    dbgs() << "\n";
+    TotalSize += Node.Size;
+  }
+  dbgs() << "Total Size:" << TotalSize << "\n";
+}
+
+bool AMDGPUHotBlockRematerialize::hotBlockRemat(
+    MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
+    MachineDominatorTree *DT, MachinePostDominatorTree *PDT,
+    bool &IsNearTarget) {
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+
+  const SIInstrInfo *SIII = ST->getInstrInfo();
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
+  for (MachineBasicBlock *MBB : RPOT) {
+    RPOTIndexMap[MBB] = RPOTIndexMap.size();
+  }
+
+  auto &MRI = MF.getRegInfo();
+
+  bool IsUpdated = false;
+  RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST);
+
+  const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+  if (Status.TargetOcc >= MaxOcc)
+    return false;
+
+  unsigned VLimit = Status.TargetVLimit;
+  unsigned SLimit = Status.TargetSLimit;
+
+  int RematSCnt = Status.MaxSPressure - SLimit;
+
+  bool IsSGPRSpill = false;
+  if (RematSCnt > 0) {
+    IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
+  }
+
+  const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
+
+  // If bound by lds, skip.
+  if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
+      !IsForceRematSgpr)
+    return false;
+
+  MachineBasicBlock *EntryMBB = &MF.front();
+
+  auto *SlotIndexes = LIS->getSlotIndexes();
+
+  // Regs already marked for remat.
+  MapVector<Register, RematNode> VRematMap;
+  MapVector<Register, RematNode> SRematMap;
+  // Regs that cannot be moved around for remat.
+  DenseSet<unsigned> PinnedRegSet;
+  std::vector<BlockLiveInfo> HotBlocks;
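+  // Walk the blocks in post order. For each block whose pressure can exceed
+  // the target limits, recompute the max pressure with the remat decisions
+  // made so far, then build and filter SGPR remat candidates until the
+  // remaining pressure is within (or near) the target limit. VGPR remat is
+  // still a TODO below.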
+  for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) {
+    MachineBasicBlock *MBB = *It;
+    auto &RP = Status.MBBPressureMap[MBB];
+    // Ignore blocks that are not hot.
+    if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit &&
+        (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) <
+            Status.TargetSLimit)
+      continue;
+    // Collect reg pressure.
+    unsigned MaxVPressure = 0;
+    unsigned MaxSPressure = 0;
+    const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB];
+
+    const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB];
+    LLVM_DEBUG(
+        dumpHotBlock(InputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI));
+
+    GCNDownwardRPTracker Tracker(*LIS);
+
+    Tracker.reset(*MBB->begin(), &InputLive);
+
+    for (MachineInstr &MI : *MBB) {
+      if (MI.isDebugInstr())
+        continue;
+      Tracker.advance();
+      auto LISLR = Tracker.getLiveRegs();
+      // Update the live set for regs that have already been rematerialized.
+      updateLiveInfo(VRematMap, LISLR, InputLive, MBB, RPOTIndexMap);
+      updateLiveInfo(SRematMap, LISLR, InputLive, MBB, RPOTIndexMap);
+
+      const GCNRPTracker::LiveRegSet &LiveSet = LISLR;
+      unsigned VPressure = 0;
+      unsigned SPressure = 0;
+      collectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure);
+      if (MaxVPressure < VPressure)
+        MaxVPressure = VPressure;
+      if (MaxSPressure < SPressure)
+        MaxSPressure = SPressure;
+    }
+    MaxSPressure += RegForVCC + Status.InputPhysicalSPressure;
+    if (MaxVPressure <= VLimit && MaxSPressure <= SLimit)
+      continue;
+
+    // Build block live info.
+    // Use outputLive for EntryMBB.
+    BlockLiveInfo LiveInfo = {MBB, MaxSPressure, MaxVPressure,
+                              MBB != EntryMBB ? InputLive : OutputLive};
+    // Skip the entry block when saving hot blocks, to reduce cloning: we never
+    // clone into the entry block.
+    if (MBB != EntryMBB)
+      HotBlocks.emplace_back(LiveInfo);
+    GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive;
+
+    // Update reg pressure based on remat list.
+    InstSet VReducedInsts;
+    InstSet SReducedInsts;
+    int VReduced = getReducedSize(VRematMap, CandidateRegs, VReducedInsts, MRI,
+                                  LiveInfo, RPOTIndexMap);
+    int SReduced = getReducedSize(SRematMap, CandidateRegs, SReducedInsts, MRI,
+                                  LiveInfo, RPOTIndexMap);
+
+    // Calculate the size that needs to be rematerialized.
+    int RematVCnt = MaxVPressure - VReduced - VLimit;
+    int RematSCnt = MaxSPressure - SReduced - SLimit;
+
+    bool IsSGPRSpill = false;
+    if (RematSCnt > 0) {
+      IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF);
+    }
+    bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
+    // Try to add candidates into remat list.
+
+    int NewRematSCnt = 0;
+    if (RematSCnt > 0) {
+      // Build candidate nodes.
+      std::vector<RematNode> SRematCandidates;
+      buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI,
+                          SIII, SIRI, /*IsVGPR*/ false);
+
+      LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI));
+      std::vector<RematNode> SRematList;
+      // Filter candidates.
+      NewRematSCnt = filterRematCandiates(SRematCandidates, SRematList,
+                                          PinnedRegSet, DT, PDT, MLI, MRI,
+                                          /*IsVGPR*/ false, Status.MemBound);
+      if (NewRematSCnt > RematSCnt) {
+        // There are enough remat nodes to cover RematSCnt.
+        int RematCnt = 0;
+        for (RematNode &Node : SRematList) {
+          SRematMap[Node.Reg] = Node;
+          RematCnt += Node.Size;
+          if (RematCnt > RematSCnt)
+            break;
+        }
+        NewRematSCnt = 0;
+      } else {
+
+        for (RematNode &Node : SRematList) {
+          SReducedInsts.insert(Node.DefMI);
+        }
+        // Check shared size.
+        int SharedReducedSize =
+            getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI);
+        if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
+            RematSCnt) {
+          for (RematNode &Node : SRematList) {
+            SRematMap[Node.Reg] = Node;
+          }
+        } else {
+          if (!IsForceRematSgpr)
+            return false;
+          for (RematNode &Node : SRematList) {
+            SRematMap[Node.Reg] = Node;
+          }
+          // Find local one-def-one-use candidates.
+          for (MachineInstr &MI : *MBB) {
+            if (MI.isDebugInstr())
+              continue;
+            if (MI.getDesc().NumDefs != 1)
+              continue;
+            MachineOperand &DstMO = MI.getOperand(0);
+            Register Reg = DstMO.getReg();
+            if (!SIRI->isSGPRReg(MRI, Reg))
+              continue;
+            if (!MRI.hasOneNonDBGUse(Reg))
+              continue;
+            if (!MRI.hasOneDef(Reg))
+              continue;
+            if (Reg.isPhysical())
+              continue;
+            MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
+            if (UseMI.getParent() != MBB)
+              continue;
+            int Gain = rematGain(&MI, Reg, MRI, SIRI,
+                                 /*IsVGPR*/ false);
+            if (Gain > 0) {
+              // Skip the case where DefMI has an implicit def that UseMI uses.
+              if (isImplicitDefUse(&MI, &UseMI)) {
+                continue;
+              }
+              RematNode Node = {Reg, &MI, (unsigned)Gain >> 5};
+              Node.InsertPointMI = &UseMI;
+              Node.Kind = RematNode::RematKind::OneDefOneUse;
+              SRematMap[Reg] = Node;
+              SharedReducedSize += Node.Size;
+            }
+          }
+        }
+        NewRematSCnt = RematSCnt - NewRematSCnt - SharedReducedSize;
+      }
+    }
+    // If this worked, continue.
+
+    // Plan for VGPR remat (TODO below): collect live ranges from the hot
+    // instructions, find the live ranges common to them, rematerialize those,
+    // and apply the remat.
+
+    int NewRematVCnt = 0;
+    if (RematVCnt > 0) {
+      // TODO: V remat.
+    }
+
+    bool NeedSRemat = RematSCnt > 0;
+    bool NeedVRemat = RematVCnt > 0;
+    // If SGPRs would spill, always do remat.
+    bool IsSRematOK =
+        (NewRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr;
+    bool IsVRematOK =
+        (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty();
+    if (NeedSRemat && NeedVRemat) {
+      if (IsVRematOK && IsSRematOK) {
+        IsUpdated = true;
+      } else if (IsSGPRSpill) {
+        IsUpdated = true;
+      }
+    } else if (NeedSRemat) {
+      if (IsSRematOK) {
+        IsUpdated = true;
+      }
+    } else if (NeedVRemat) {
+      if (IsVRematOK) {
+        IsUpdated = true;
+      }
+    }
+    // TODO: what to do when the target cannot be reached?
+    if (NewRematSCnt > 0) {
+      if ((unsigned)NewRematSCnt <= NearTargetRegLimit) {
+        IsNearTarget = true;
+      } else {
+        if (!IsSGPRSpill)
+          return false;
+      }
+    }
+  }
+
+  if (SRematMap.empty() && VRematMap.empty()) {
+    return IsUpdated;
+  }
+
+  if (!SRematMap.empty()) {
+    IsUpdated = true;
+    applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF);
+    LLVM_DEBUG(dbgs() << "After hot-block remat:\n"; MF.print(dbgs()));
+  }
+
+  // Balance between vector and scalar if possible.
+  return IsUpdated;
+}
+
+bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.size() < 2)
+    return false;
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  MachineDominatorTree *DT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  MachinePostDominatorTree *PDT =
+      &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+
+  bool IsNearTarget = false;
+  return hotBlockRemat(MF, MLI, LIS, DT, PDT, IsNearTarget);
+}
+
+} // namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+                      "AMDGPU rematerialize", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+                    "AMDGPU rematerialize", false, false)
+
+char AMDGPUHotBlockRematerialize::ID = 0;
+char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
+
+FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
+  return new AMDGPUHotBlockRematerialize();
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
new file mode 100644
index 0000000000000..dc8b67e368516
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -0,0 +1,217 @@
+//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for MIR passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMIRUtils.h"
+#include "SIRegisterInfo.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+
+#define DEBUG_TYPE "xb-mir-util"
+using namespace llvm;
+
+namespace llvm {
+bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd,
+                       MachineBasicBlock &MBB) {
+  // BBEnd doesn't point at a boundary instruction.
+  // Skip debug instructions.
+  while (BBEnd != MBB.rend() && BBEnd->isDebugInstr())
+    BBEnd++;
+  return BBEnd != MBB.rend();
+}
+} // namespace llvm
+
+namespace {
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
+                    SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+  MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
+  MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
+  // Treat segments not bounded by instructions as not local.
+  if (!StartMI || !EndMI)
+    return false;
+  // Local when both ends are in the same MBB.
+  bool IsSameMBB = StartMI->getParent() == EndMI->getParent();
+  if (!IsSameMBB)
+    return false;
+  // Collect touched MBB.
+  MachineBasicBlock *MBB = StartMI->getParent();
+  TouchedMBBSet.insert(MBB);
+  return true;
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes,
+                      SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+  for (const LiveRange::Segment &Seg : Range->segments) {
+    if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet))
+      return false;
+  }
+  return true;
+}
+
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) {
+  MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
+  MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
+  // Treat segments not bounded by instructions as not local.
+  if (!StartMI || !EndMI)
+    return false;
+  // Local when both ends are in the same MBB.
+  return StartMI->getParent() == EndMI->getParent();
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
+  for (const LiveRange::Segment &Seg : Range->segments) {
+    if (!isLocalSegment(&Seg, Indexes))
+      return false;
+  }
+  return true;
+}
+
+} // namespace
+
+// For a case like a float4 v where v.x is defined and used in one block and
+// v.y is defined and used in another, one live interval can touch more than
+// one MBB. TouchedMBBSet is used for scheduling, where a local live interval
+// can cross multiple regions and the live regs must be computed for each
+// region inside the touched MBBs.
+bool llvm::isLocalLiveInterval(
+    const LiveInterval &LI, SlotIndexes *Indexes,
+    SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+  if (LI.hasSubRanges()) {
+    for (const auto &S : LI.subranges()) {
+      if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
+        return false;
+    }
+  }
+  return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
+}
+
+bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
+  if (LI.hasSubRanges()) {
+    for (const auto &S : LI.subranges()) {
+      if (!isLocalLiveRange(&S, Indexes))
+        return false;
+    }
+  }
+  return isLocalLiveRange(&LI, Indexes);
+}
+
+void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
+
+  dbgs() << "\n live set: \n";
+  for (auto It : LiveSet) {
+    int Reg = It.first;
+    dbgs() << printReg(Reg, SIRI);
+    if (It.second.any()) {
+      dbgs() << " mask:" << It.second.getAsInteger();
+    }
+    dbgs() << "\n";
+  }
+}
+
+namespace llvm {
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI) {
+  unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+  Size >>= 5;
+  if (Mask.any()) {
+    if (unsigned MaskSize = Mask.getNumLanes()) {
+      if (MaskSize < Size)
+        Size = MaskSize;
+    }
+  }
+  return Size;
+}
+
+void collectLiveSetPressure(const LiveSet &LiveSet,
+                            const MachineRegisterInfo &MRI,
+                            const SIRegisterInfo *SIRI, unsigned &VPressure,
+                            unsigned &SPressure) {
+  VPressure = 0;
+  SPressure = 0;
+  for (auto LiveIt : LiveSet) {
+    unsigned Reg = LiveIt.first;
+    unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI);
+    if (SIRI->isVGPR(MRI, Reg)) {
+      VPressure += Size;
+    } else {
+      SPressure += Size;
+    }
+  }
+}
+
+bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+  // Support multi def for pattern of pointer:
+  // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32
+  // %808.sub1:sgpr_64 = S_MOV_B32 0
+  bool HasSub0 = false;
+  bool HasSub1 = false;
+  for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) {
+    if (unsigned SubReg = UserDefMO.getSubReg()) {
+      bool IsSingleSubReg = false;
+      switch (SubReg) {
+      default:
+        break;
+      case AMDGPU::sub0:
+        if (!HasSub0) {
+          HasSub0 = true;
+          IsSingleSubReg = true;
+        }
+        break;
+      case AMDGPU::sub1:
+        if (!HasSub1) {
+          HasSub1 = true;
+          IsSingleSubReg = true;
+        }
+        break;
+      }
+      if (!IsSingleSubReg) {
+        HasSub0 = false;
+        break;
+      }
+    } else {
+      HasSub0 = false;
+      break;
+    }
+  }
+
+  return (HasSub0 && HasSub1);
+}
+
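+// NOTE: reach_block below calls loopContainsBoth, which is not defined in this
+// patch. A minimal sketch of the assumed helper ("both blocks belong to a
+// common loop nest") is given here; the real definition may live elsewhere in
+// the PR.
+static bool loopContainsBoth(const MachineLoopInfo *LI,
+                             const MachineBasicBlock *BB0,
+                             const MachineBasicBlock *BB1) {
+  const MachineLoop *Loop0 = LI->getLoopFor(BB0);
+  const MachineLoop *Loop1 = LI->getLoopFor(BB1);
+  // True when one loop contains the other (or they are the same loop).
+  return Loop0 && Loop1 && (Loop0->contains(Loop1) || Loop1->contains(Loop0));
+}
+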
+bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT,
+                 MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
+                 MachineBasicBlock *ToBB) {
+  if (FromBB == ToBB) {
+    return true;
+  }
+
+  if (DT->dominates(FromBB, ToBB)) {
+    return true;
+  }
+
+  if (PDT->dominates(ToBB, FromBB)) {
+    return true;
+  }
+
+  if (loopContainsBoth(LI, ToBB, FromBB)) {
+    return true;
+  }
+  // TODO: cover the case where the hot BB is in a loop and
+  //       one block in that loop dominates the BB, or
+  //       the BB post-dominates one block in that loop.
+  return false;
+}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
new file mode 100644
index 0000000000000..c4452c91a43a8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -0,0 +1,62 @@
+//===------- AMDGPUMIRUtils.h - Helpers for MIR passes --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for MIR passes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+class LiveInterval;
+class MachineDominatorTree;
+class MachineLoopInfo;
+class MachinePostDominatorTree;
+class MachineRegisterInfo;
+class SIRegisterInfo;
+class SlotIndexes;
+
+constexpr unsigned RegForVCC = 2;
+
+bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
+                       llvm::MachineBasicBlock &MBB);
+
+// Check whether LI is live across basic blocks; if it is local, record all
+// touched basic blocks.
+bool isLocalLiveInterval(
+    const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
+    llvm::SmallDenseSet<llvm::MachineBasicBlock *, 2> &TouchedMBBSet);
+bool isLocalLiveInterval(const llvm::LiveInterval &LI,
+                         llvm::SlotIndexes *Indexes);
+
+bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
+
+using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
+void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI);
+
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI);
+void collectLiveSetPressure(const LiveSet &LiveSet,
+                            const llvm::MachineRegisterInfo &MRI,
+                            const llvm::SIRegisterInfo *SIRI,
+                            unsigned &VPressure, unsigned &SPressure);
+
+bool reach_block(llvm::MachineBasicBlock *FromBB,
+                 llvm::MachineDominatorTree *DT,
+                 llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
+                 llvm::MachineBasicBlock *ToBB);
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
new file mode 100644
index 0000000000000..32301130606a7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -0,0 +1,18 @@
+//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//==------------------------------------------------------------------------==//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//==------------------------------------------------------------------------==//
+
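+// This file is currently only a placeholder: the definitions of SchedScore's
+// methods and collectLatency (declared in AMDGPUOccupancyAndLatencyHelper.h)
+// are expected to be added in later patches of this series.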
+namespace llvm {
+} // namespace llvm
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
new file mode 100644
index 0000000000000..f9be0a2c73d86
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -0,0 +1,53 @@
+//==- AMDGPUOccupancyAndLatencyHelper.h - Helpers for occupancy + latency ---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
+
+namespace llvm {
+
+class MachineFunction;
+class GCNSubtarget;
+class MachineLoopInfo;
+
+struct SchedScore {
+  // Score for this Sched result.
+  unsigned Occupancy = 0;
+  bool SgprSpill = false;
+  unsigned LatencyHide = 0; // Latency that gets hidden. (Will splitting 2
+                            // loads into 2 passes help?)
+  unsigned MemLatency = 0;  // Only stores mem latency.
+                            // We want mem latency small and hidden latency
+                            // big; compare MemLatency - LatencyHide * Occ,
+                            // smaller is better.
+  unsigned MixAlu = 0;      // VALU and SALU can run in parallel if Occ > 1.
+  unsigned Alu = 0; // Avoid sequences where the s_alu inst count is less than
+                    // the occupancy.
+  unsigned Lds = 0; // TODO: count LDS.
+  SchedScore() {}
+
+  // Other info which can help compare schedule results.
+  float computeScore() const;
+  float computeScore2() const;
+
+  void sum(const SchedScore &S, unsigned LoopDepth = 0);
+  bool isBetter(const SchedScore &S) const;
+  bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
+  // More latency can be hidden with ExtraOcc.
+  unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
+};
+
+SchedScore collectLatency(llvm::MachineFunction &MF,
+                          const llvm::GCNSubtarget &ST,
+                          const llvm::MachineLoopInfo *MLI = nullptr);
+
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 90e3489ced923..9c1aec6cd047d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -397,6 +397,12 @@ static cl::opt<bool>
                          cl::desc("Enable s_delay_alu insertion"),
                          cl::init(true), cl::Hidden);
 
+// Enable hot-block rematerialization.
+static cl::opt<bool>
+    EnableHotBlockRemat("amdgpu-enable-hot-block-remat",
+                        cl::desc("Enable HotBlock Rematerialize optimization"),
+                        cl::init(false), cl::Hidden);
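+// Illustrative usage (foo.mir is a placeholder input): the pass is gated on
+// -amdgpu-enable-hot-block-remat in the pipeline, and can be run directly in
+// tests, e.g. llc -mtriple=amdgcn -run-pass=amdgpu-hot-block-remat foo.mir.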
+
 // Enable GFX11+ VOPD
 static cl::opt<bool>
     EnableVOPD("amdgpu-enable-vopd",
@@ -521,6 +527,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
+  initializeAMDGPUHotBlockRematerializePass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
@@ -1539,6 +1546,10 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (TM->getOptLevel() > CodeGenOptLevel::Less)
     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
+  // Rematerialization must run before PHI elimination.
+  if (isPassEnabled(EnableHotBlockRemat))
+    addPass(&AMDGPUHotBlockRematerializeID);
+
   TargetPassConfig::addOptimizedRegAlloc();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..79fdbba1d0db1 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHotBlockRematerialize.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
@@ -81,10 +82,12 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMemoryUtils.cpp
+  AMDGPUMIRUtils.cpp
   AMDGPUIGroupLP.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
+  AMDGPUOccupancyAndLatencyHelper.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
   AMDGPUPreLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 7554b9f578fcb..aa4b3f948b726 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -47,6 +47,10 @@ struct GCNRegPressure {
 
   void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
 
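+  /// \returns the larger of the SGPR32 pressure and the SGPR tuples weight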
+  unsigned getMaxSGPR() const {
+    return std::max(getSGPRNum(), getSGPRTuplesWeight());
+  }
+
   /// \returns the SGPR32 pressure
   unsigned getSGPRNum() const { return Value[SGPR32]; }
   /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p

>From 6854976b4d2ae4af1d3caba6ef2b5c39c7925d2d Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Fri, 18 Apr 2025 15:24:29 -0700
Subject: [PATCH 02/11] First build

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 237 ++++++++-
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 467 +++++++++++++++++-
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |  40 ++
 .../AMDGPUOccupancyAndLatencyHelper.cpp       | 151 ++++++
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |  27 +
 5 files changed, 909 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 70b25beeb22b9..95237062a6093 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -37,6 +37,7 @@ namespace {
 
 typedef DenseSet<MachineInstr *> InstSet;
 typedef DenseSet<MachineBasicBlock *> BlockSet;
+template <typename T> using BlockMap = MapVector<MachineBasicBlock *, T>;
 
 struct RematNode {
   enum class RematKind {
@@ -107,20 +108,17 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
   AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void applyCloneRemat(RematNode &Node,
+    std::vector<BlockLiveInfo> &HotBlocks,
+    MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+    SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+    const SIInstrInfo *SIII, MachineFunction &MF);
   void applyRemat(MapVector<Register, RematNode> &RematMap,
     std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
     llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
     const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
     MachineFunction &MF);
-  void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
-    llvm::SlotIndexes *SlotIndexes,
-    const SIRegisterInfo *SIRI,
-    const SIInstrInfo *SIII);
-  void applyCloneRemat(RematNode &Node,
-    std::vector<BlockLiveInfo> &HotBlocks,
-    MachineDominatorTree *DT, MachineRegisterInfo &MRI,
-    llvm::SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-    const SIInstrInfo *SIII, MachineFunction &MF);
   bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
     LiveIntervals *LIS, MachineDominatorTree *DT,
     MachinePostDominatorTree *PDT, bool &IsNearTarget);
@@ -138,6 +136,227 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
   }
 };
 
+MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
+    MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  const bool WillSmashScc =
+      InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
+  if (WillSmashScc) {
+    CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef(
+        MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+  }
+
+  return CurrentInsertPoint;
+}
+
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+    unsigned Reg, BlockMap<SmallVector<MachineInstr *, 2>> &UserBlocks,
+    DenseSet<MachineBasicBlock *> &UserMBBSet,
+    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT) {
+  // Collect hot blocks which Exp is live in.
+  DenseSet<MachineBasicBlock *> HotBlockSet;
+  for (BlockLiveInfo &HotBlock : HotBlocks) {
+    if (HotBlock.InputLive.count(Reg)) {
+      HotBlockSet.insert(HotBlock.BB);
+    }
+  }
+
+  // User blocks which dominate all hot blocks don't need a clone, because the
+  // value no longer crosses the hot blocks once the later blocks are cloned.
+  // User blocks which are dominated by all hot blocks can share clones,
+  // because once past the hot blocks the pressure is OK.
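+  // Illustrative example: with a single hot block H, a user block that
+  // dominates H keeps using the original def (no clone is needed), while user
+  // blocks dominated by H are collected into AfterHotRangeMBBs below so they
+  // can share one clone placed after the hot range.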
+  DenseSet<MachineBasicBlock *> AfterHotRangeMBBs;
+  for (MachineBasicBlock *MBB : UserMBBSet) {
+    // Always clone in hot block.
+    if (HotBlockSet.count(MBB))
+      continue;
+
+    bool IsDomAllHotBlocks = true;
+    bool IsDomedByAllHotBlocks = true;
+    for (MachineBasicBlock *HotMBB : HotBlockSet) {
+      if (!DT->dominates(MBB, HotMBB)) {
+        IsDomAllHotBlocks = false;
+      }
+      if (!DT->dominates(HotMBB, MBB)) {
+        IsDomedByAllHotBlocks = false;
+      }
+      if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) {
+        break;
+      }
+    }
+    if (IsDomAllHotBlocks) {
+      UserBlocks.erase(MBB);
+    } else if (IsDomedByAllHotBlocks) {
+      AfterHotRangeMBBs.insert(MBB);
+    }
+  }
+
+  // Split after hotRange block set by domtree.
+  DenseMap<MachineBasicBlock *, BlockSet> DomMap;
+  if (!AfterHotRangeMBBs.empty()) {
+    for (MachineBasicBlock *MBB : AfterHotRangeMBBs) {
+      for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) {
+        if (MBB == MBB2)
+          continue;
+        if (DT->dominates(MBB, MBB2)) {
+          auto &Dom = DomMap[MBB];
+          Dom.insert(MBB2);
+          auto &Dom2 = DomMap[MBB2];
+          Dom.insert(Dom2.begin(), Dom2.end());
+        }
+      }
+    }
+    for (MachineBasicBlock *MBB : AfterHotRangeMBBs) {
+      auto &Dom = DomMap[MBB];
+      for (MachineBasicBlock *DomedMBB : Dom) {
+        // Remove domedMBB.
+        DomMap.erase(DomedMBB);
+        UserMBBSet.erase(DomedMBB);
+      }
+    }
+  }
+
+  return DomMap;
+}
+
+void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
+                 SmallVector<MachineInstr *, 2> &UserMIs) {
+  for (MachineInstr *UseMI : UserMIs) {
+    for (MachineOperand &MO : UseMI->operands()) {
+      if (!MO.isReg())
+        continue;
+      if (MO.getReg() == Reg) {
+        MO.setReg(NewReg);
+        if (IsSubRegDef)
+          MO.setSubReg(0);
+      }
+    }
+  }
+}
+
+void AMDGPUHotBlockRematerialize::applyCloneRemat(RematNode &Node,
+                     std::vector<BlockLiveInfo> &HotBlocks,
+                     MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+                     SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+                     const SIInstrInfo *SIII, MachineFunction &MF) {
+  unsigned Reg = Node.Reg;
+
+  MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+  auto DefOp = DefMI->getOperand(0);
+  const MCInstrDesc &Desc = DefMI->getDesc();
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  // When the unique def has subReg, just create newReg for the subReg part.
+  bool IsSubRegDef = false;
+  if (DefOp.getSubReg() != 0) {
+    RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg());
+    IsSubRegDef = true;
+  }
+  const DebugLoc DL = DefMI->getDebugLoc();
+  unsigned OpNum = DefMI->getNumOperands();
+
+  Node.Kind = RematNode::RematKind::Clone;
+
+  // Group user in same blocks.
+  BlockMap<SmallVector<MachineInstr *, 2>> UserMap;
+  DenseSet<MachineBasicBlock *> UserMBBSet;
+  for (auto UseIt = MRI.use_instr_nodbg_begin(Reg);
+       UseIt != MRI.use_instr_nodbg_end();) {
+    MachineInstr &UseMI = *(UseIt++);
+    UserMap[UseMI.getParent()].emplace_back(&UseMI);
+    UserMBBSet.insert(UseMI.getParent());
+  }
+
+  DenseMap<MachineBasicBlock *, BlockSet> DomMap =
+      reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT);
+
+  for (auto UseIt : UserMap) {
+    MachineBasicBlock *MBB = UseIt.first;
+    // Skip same block uses.
+    if (MBB == DefMI->getParent()) {
+      continue;
+    }
+    // Skip MBB which share clone from other MBBs.
+    if (UserMBBSet.count(MBB) == 0)
+      continue;
+
+    Register NewReg = MRI.createVirtualRegister(RC);
+    auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg);
+    for (unsigned I = 1; I < OpNum; I++) {
+      NewDef = NewDef.add(DefMI->getOperand(I));
+    }
+
+    MachineInstr *InsertPointMI = UseIt.second.front();
+    SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI);
+
+    for (MachineInstr *UseMI : UseIt.second) {
+      SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI);
+      if (LastSlot > Slot) {
+        LastSlot = Slot;
+        InsertPointMI = UseMI;
+      }
+    }
+
+    MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash(
+        DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII);
+
+    for (MachineMemOperand *MO : DefMI->memoperands()) {
+      NewDef->addMemOperand(MF, MO);
+    }
+
+    MBB->insert(InsertPoint, NewDef);
+
+    SlotIndexes->insertMachineInstrInMaps(*NewDef);
+
+    SmallVector<MachineInstr *, 2> &UserMIs = UseIt.second;
+    updateUsers(Reg, NewReg, IsSubRegDef, UserMIs);
+
+    // update users in dom MBBs.
+    auto DomMapIt = DomMap.find(MBB);
+    if (DomMapIt != DomMap.end()) {
+      for (MachineBasicBlock *UpdateMBB : DomMapIt->second) {
+        SmallVector<MachineInstr *, 2> &UserMIs = UserMap[UpdateMBB];
+        updateUsers(Reg, NewReg, IsSubRegDef, UserMIs);
+      }
+    }
+
+    llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes);
+  }
+  if (MRI.use_empty(Reg)) {
+    SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+  }
+}
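+// Illustrative effect of applyCloneRemat (hypothetical MIR): for a def such as
+//   bb.0:  %7:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %3, 0, 0
+// with users in bb.4 and bb.9 and a hot range in between, a clone of the load
+// is emitted in each surviving user block, so %7 no longer has to stay live
+// across the hot range.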
+
+void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
+                            SlotIndexes *SlotIndexes,
+                            const SIRegisterInfo *SIRI,
+                            const SIInstrInfo *SIII) {
+  MachineInstr *DefMI = Node.DefMI;
+  MachineInstr *InsertPointMI = Node.InsertPointMI;
+  MachineBasicBlock *MBB = nullptr;
+
+  // Find a valid insert point.
+  MachineBasicBlock::iterator InsertPoint;
+  if (InsertPointMI) {
+    InsertPoint = InsertPointMI->getIterator();
+    MBB = InsertPointMI->getParent();
+  } else {
+    InsertPoint = Node.InsertBlock->getFirstTerminator();
+    MBB = Node.InsertBlock;
+  }
+
+  InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI,
+                                                 SIRI, SIII);
+
+  // Move instruction to new location.
+  DefMI->removeFromParent();
+  InsertPoint->getParent()->insert(InsertPoint, DefMI);
+
+  // Update slot index.
+  SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+  SlotIndexes->insertMachineInstrInMaps(*DefMI);
+}
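+// Illustrative effect of applyOneDefOneUseRemat (hypothetical MIR): a value
+// with a single use, e.g.
+//   bb.0:  %5:sgpr_32 = S_MOV_B32 42
+//   bb.3:  $sgpr0 = COPY %5
+// has its def moved next to the chosen insert point (typically the single
+// use), shrinking the live range that previously spanned the hot blocks.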
+
 void AMDGPUHotBlockRematerialize::applyRemat(MapVector<Register, RematNode> &RematMap,
                 std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
                 llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index dc8b67e368516..6d6bd38c61c06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 
 #define DEBUG_TYPE "xb-mir-util"
 using namespace llvm;
@@ -79,14 +80,132 @@ bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
   return true;
 }
 
+// LoopInfo contains a mapping from basic block to the innermost loop. Find
+// the outermost loop in the loop nest that contains BB.
+const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI,
+                                    const MachineBasicBlock *BB) {
+  const MachineLoop *L = LI->getLoopFor(BB);
+  if (L) {
+    while (const MachineLoop *Parent = L->getParentLoop())
+      L = Parent;
+  }
+  return L;
+}
+
+bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
+                      const MachineBasicBlock *BB2) {
+  const MachineLoop *L1 = getOutermostLoop(LI, BB1);
+  const MachineLoop *L2 = getOutermostLoop(LI, BB2);
+  return L1 != nullptr && L1 == L2;
+}
+
 } // namespace
 
+
+namespace llvm {
+
+bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
+                       llvm::MachineBasicBlock::iterator MI) {
+  const TargetRegisterInfo *TRI =
+      MBB->getParent()->getRegInfo().getTargetRegisterInfo();
+  for (auto It = MI; It != MBB->end(); ++It) {
+    const MachineInstr &CurMI = *It;
+    // Hit use of scc, it is live.
+    if (CurMI.readsRegister(AMDGPU::SCC, TRI))
+      return true;
+    // Hit def of scc first, not live.
+    if (CurMI.definesRegister(AMDGPU::SCC, TRI))
+      return false;
+  }
+  // Reach the end of MBB, check live-ins of MBB successors.
+  for (const MachineBasicBlock *Succ : MBB->successors()) {
+    if (Succ->isLiveIn(AMDGPU::SCC))
+      return true;
+  }
+  return false;
+}
+
+//
+// This function is useful for when we need to insert a new
+// instruction that defines scc in a block and we need to find
+// a location that will not smash the existing value.
+//
+// Starting at `BeforeInst` it will look backwards to try to find
+// a place in the block where scc is dead so we can insert our new
+// def there. If no location can be found it will save and restore
+// scc around BeforeInst. This way BeforeInst can safely be used
+// as the new insert location.
+//
+MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
+    MachineBasicBlock *MBB, MachineBasicBlock::iterator MI,
+    const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
+    MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
+  // If SCC is dead at MI then we can use MI as the insert point.
+  if (!llvm::isSccLiveAt(MBB, MI)) {
+    return MI;
+  }
+
+  const bool CheckForExecWrite =
+      Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
+
+  // Get the starting reverse iterator taking care to handle the MBB->end()
+  // case.
+  MachineBasicBlock::reverse_iterator Start;
+  if (MI == MBB->end()) {
+    Start = MBB->rbegin();
+  } else {
+    Start = MI.getReverse();
+  }
+
+  // Otherwise, walk backwards through the block looking for a location where
+  // SCC is dead.
+  for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend();
+       It != End; ++It) {
+    // If the instruction modifies exec then we cannot use it as
+    // an insertion point (if that is a constraint from the caller).
+    // The check for EXEC works for both wave64 and wave32 because
+    // it will also catch writes to the subregisters (e.g. exec_lo).
+    if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) {
+      break;
+    }
+
+    if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
+        !It->readsRegister(AMDGPU::SCC, TRI)) {
+      return It->getIterator();
+    }
+  }
+
+  // If no safe location can be found in the block we can save and restore
+  // SCC around MI. There is no way to directly read or write SCC so we use
+  // s_cselect to read the current value of SCC and s_cmp to write the saved
+  // value back to SCC.
+  //
+  // The generated code will look like this:
+  //
+  //      S_CSELECT_B32 %SavedSCC, -1, 0  # Save SCC
+  //      <----- Newly created safe insert point.
+  //      MI
+  //      S_CMP_LG_U32 %SavedSCC, 0       # Restore SCC
+  //
+  Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
+      .addImm(-1)
+      .addImm(0);
+  BuildMI(*MBB, std::next(MI->getIterator()), DL,
+          TII->get(AMDGPU::S_CMP_LG_U32))
+      .addReg(TmpScc, RegState::Kill)
+      .addImm(0);
+
+  return MI;
+}
+
+// In case like float4 v, v.x used and defined in one block, v.y used and defined
 // in another block, one live interval could touch more than one MBB.
 // TouchedMBBSet is used for scheduling where local live interval could cross
 // multiple regions, need to calculate livereg for each region inside touched
 // MBB.
-bool llvm::isLocalLiveInterval(
+bool isLocalLiveInterval(
     const LiveInterval &LI, SlotIndexes *Indexes,
     SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
   if (LI.hasSubRanges()) {
@@ -98,7 +217,7 @@ bool llvm::isLocalLiveInterval(
   return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
 }
 
-bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
+bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
   if (LI.hasSubRanges()) {
     for (const auto &S : LI.subranges()) {
       if (!isLocalLiveRange(&S, Indexes))
@@ -108,7 +227,7 @@ bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
   return isLocalLiveRange(&LI, Indexes);
 }
 
-void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
+void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
 
   dbgs() << "\n live set: \n";
   for (auto It : LiveSet) {
@@ -121,7 +240,347 @@ void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
   }
 }
 
-namespace llvm {
+LaneBitmask getRegMask(const MachineOperand &MO,
+                       const MachineRegisterInfo &MRI) {
+  // We don't rely on the read-undef flag because in case of tentative schedule
+  // tracking it isn't set correctly yet. This works correctly however since
+  // the use mask has been tracked before using LIS.
+  return MO.getSubReg() == 0
+             ? MRI.getMaxLaneMaskForVReg(MO.getReg())
+             : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(
+                   MO.getSubReg());
+}
+
+struct Piece {
+  unsigned Reg;
+  unsigned Offset;
+  unsigned Size;
+  static SmallVector<Piece, 8> split(std::bitset<32> Mask) {
+
+    SmallVector<Piece, 8> Pieces;
+    Piece Piece = {0, 0, 0};
+    for (unsigned i = 0; i < 32; i++) {
+      if (Mask.test(i)) {
+        if (Piece.Size == 0)
+          Piece.Offset = i;
+
+        Piece.Size++;
+        // Make sure no piece bigger than 8.
+        if (Piece.Size == 8) {
+          Pieces.emplace_back(Piece);
+          Piece.Size = 0;
+        }
+      } else {
+        if (Piece.Size == 0) {
+          continue;
+        }
+        Pieces.emplace_back(Piece);
+        Piece.Size = 0;
+      }
+    }
+    return Pieces;
+  }
+};
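+// Illustrative example: a use mask of 0b111100 yields a single Piece with
+// Offset = 2 and Size = 4, while 0x00ff00ff has a gap and splits into two
+// pieces {Offset = 0, Size = 8} and {Offset = 16, Size = 8}.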
+
+static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI,
+                                      const MachineRegisterInfo &MRI) {
+  const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+  const TargetRegisterClass *SubregRC =
+      SIRI->getSubRegisterClass(RC, AMDGPU::sub0);
+  return SubregRC->LaneMask.getNumLanes();
+}
+
+static std::vector<unsigned>
+getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI,
+                                          const TargetRegisterClass *RC,
+                                          LaneBitmask Mask) {
+  // TODO: this could replace the code it was copied from in SplitKit.cpp
+
+  // First pass: Try to find a perfectly matching subregister index.
+  // If none exists find the one covering the most lanemask bits.
+  SmallVector<unsigned, 8> PossibleIndexes;
+  unsigned BestIdx = 0;
+  const LaneBitmask Avoid = ~Mask;
+  {
+    unsigned BestCover = 0;
+    for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
+      // Is this index even compatible with the given class?
+      if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
+        continue;
+      LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+      // Early exit if we found a perfect match.
+      if (SubRegMask == Mask) {
+        BestIdx = Idx;
+        break;
+      }
+
+      // The index must not cover any lanes outside the mask.
+      if ((SubRegMask & Avoid).any())
+        continue;
+
+      unsigned PopCount = SubRegMask.getNumLanes();
+      PossibleIndexes.push_back(Idx);
+      if (PopCount > BestCover) {
+        BestCover = PopCount;
+        BestIdx = Idx;
+      }
+    }
+  }
+
+  // Abort if we cannot possibly implement the COPY with the given indexes.
+  if (BestIdx == 0) {
+    LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
+                      << TRI->getRegClassName(RC) << " mask "
+                      << PrintLaneMask(Mask) << '\n');
+    assert(false && "Impossible to span reg class");
+    return std::vector<unsigned>();
+  }
+
+  std::vector<unsigned> Result;
+  Result.push_back(BestIdx);
+
+  // Greedy heuristic: Keep iterating keeping the best covering subreg index
+  // each time.
+  Mask &= ~(TRI->getSubRegIndexLaneMask(BestIdx));
+  while (Mask.any()) {
+    BestIdx = 0;
+    int BestCover = std::numeric_limits<int>::min();
+    for (unsigned Idx : PossibleIndexes) {
+      LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
+      // Early exit if we found a perfect match.
+      if (SubRegMask == Mask) {
+        BestIdx = Idx;
+        break;
+      }
+
+      // Guaranteed above
+      assert((SubRegMask & Avoid).none());
+
+      // Try to cover as much of the remaining lanes as possible but as few of
+      // the already covered lanes as possible.
+      int Cover = (SubRegMask & Mask).getNumLanes() -
+                  (SubRegMask & ~Mask).getNumLanes();
+      if (Cover > BestCover) {
+        BestCover = Cover;
+        BestIdx = Idx;
+      }
+    }
+
+    if (BestIdx == 0) {
+      LLVM_DEBUG(
+          dbgs() << "Unable to find minimal spanning sub register(s) for "
+                 << TRI->getRegClassName(RC) << " mask " << PrintLaneMask(Mask)
+                 << '\n');
+      assert(false && "Impossible to span reg class");
+      return std::vector<unsigned>();
+    }
+
+    Result.push_back(BestIdx);
+    Mask &= ~TRI->getSubRegIndexLaneMask(BestIdx);
+  }
+
+  return Result;
+}
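+// Illustrative example: for a 128-bit register class and a mask covering the
+// second and third 32-bit lanes, the result is {sub1_sub2} when that index is
+// compatible with the class; otherwise the greedy loop falls back to a set
+// like {sub1, sub2}.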
+
+static void updateSubReg(MachineOperand &UseMO,
+                         const llvm::TargetRegisterClass *NewRC,
+                         unsigned Offset, const SIRegisterInfo *SIRI) {
+  unsigned Size = NewRC->getLaneMask().getNumLanes();
+  if (Size == 1) {
+    UseMO.setSubReg(0);
+  } else {
+    const uint32_t SubReg = UseMO.getSubReg();
+    LaneBitmask LaneMask = SIRI->getSubRegIndexLaneMask(SubReg);
+
+    unsigned Mask = LaneMask.getAsInteger() >> Offset;
+
+    unsigned NewSubReg = getMinimalSpanningSubRegIdxSetForLaneMask(
+                             SIRI, NewRC, LaneBitmask(Mask))
+                             .front();
+
+    UseMO.setSubReg(NewSubReg);
+  }
+}
+
+bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
+                   MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                   const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) {
+  MachineOperand &DstMO = MI.getOperand(0);
+  // Skip case when dst subReg not 0.
+  if (DstMO.getSubReg()) {
+    return false;
+  }
+  Register Reg = DstMO.getReg();
+
+  SmallVector<MachineOperand *, 2> UseMOs;
+  for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) {
+    UseMOs.emplace_back(&UseMO);
+  }
+
+  const llvm::TargetRegisterClass *NewRC =
+      SIRI->getRegClass(Desc.operands().front().RegClass);
+  if (!NewRC->isAllocatable()) {
+    if (SIRI->isSGPRClass(NewRC))
+      NewRC = SIRI->getSGPRClassForBitWidth(NewRC->MC->RegSizeInBits);
+    else if (SIRI->isVGPRClass(NewRC))
+      NewRC = SIRI->getVGPRClassForBitWidth(NewRC->MC->RegSizeInBits);
+    else
+      return false;
+
+    if (!NewRC->isAllocatable())
+      return false;
+  }
+
+  unsigned NumLanes = NewRC->getLaneMask().getNumLanes();
+  if (Offset > 0) {
+    // Update offset operand in MI.
+    MachineOperand *OffsetOp =
+        SIII->getNamedOperand(MI, AMDGPU::OpName::offset);
+
+    const uint32_t LaneSize = sizeof(uint32_t);
+    if (OffsetOp) {
+      if (OffsetOp->isImm()) {
+        // Adjust the immediate for the dropped leading lanes, matching the
+        // SGPR-offset path below.
+        int64_t NewOffset = OffsetOp->getImm() + Offset * LaneSize;
+        if (!SIII->isLegalMUBUFImmOffset(NewOffset)) {
+          return false;
+        }
+        OffsetOp->setImm(NewOffset);
+      } else {
+        return false;
+      }
+    } else {
+      OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+      if (OffsetOp) {
+        Register NewOffsetReg =
+            MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+        auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(),
+                                 SIII->get(AMDGPU::S_ADD_U32))
+                             .addDef(NewOffsetReg)
+                             .add(*OffsetOp)
+                             .addImm(Offset * LaneSize);
+        MachineInstr *OffsetAddMI = OffsetAdd.getInstr();
+        MachineBasicBlock::iterator InsertPoint =
+            llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI,
+                                                      SIII, &MRI);
+        MI.getParent()->insert(InsertPoint, OffsetAddMI);
+        SIII->legalizeOperands(*OffsetAddMI);
+        OffsetOp->setReg(NewOffsetReg);
+        OffsetOp->setSubReg(0);
+        if (SlotIndexes)
+          SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI);
+      } else {
+        return false;
+      }
+    }
+    // Update subReg for users.
+    for (MachineOperand *UseMO : UseMOs) {
+      updateSubReg(*UseMO, NewRC, Offset, SIRI);
+    }
+  } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) {
+    // Clear subReg when it's a single 32-bit reg.
+    for (MachineOperand *UseMO : UseMOs) {
+      UseMO->setSubReg(0);
+    }
+  }
+
+  MI.setDesc(Desc);
+  // Mutate reg class of Reg.
+  MRI.setRegClass(Reg, NewRC);
+  return true;
+}
+
+bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
+                       const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+                       SlotIndexes *SlotIndexes) {
+  bool IsImm = false;
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM:
+    IsImm = true;
+    LLVM_FALLTHROUGH;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+    Register Reg = MI.getOperand(0).getReg();
+    if (!MRI.getUniqueVRegDef(Reg))
+      return false;
+    LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI);
+    LaneBitmask UseMask;
+    for (MachineOperand &MO : MRI.use_operands(Reg)) {
+      UseMask |= llvm::getRegMask(MO, MRI);
+    }
+
+    const unsigned FullMask = DstMask.getAsInteger();
+    unsigned Mask = UseMask.getAsInteger();
+    if (Mask == FullMask)
+      return false;
+    // Split the mask where there are gaps, then group into pieces of 2/4/8.
+    auto Pieces = Piece::split(std::bitset<32>(Mask));
+    // Now only support 1 piece.
+    if (Pieces.size() != 1)
+      return false;
+    auto Piece = Pieces[0];
+    if (Piece.Size > 8)
+      return false;
+
+    // TODO: enable offset support when IsImm is true.
+    // Right now it breaks different tests depending on whether the offset is
+    // multiplied by LaneSize or not.
+    if (IsImm && Piece.Offset != 0)
+      return false;
+
+    const unsigned Num32BitLanes =
+        Piece.Size / getNumLanesIn32BitReg(Reg, SIRI, MRI);
+
+    switch (Num32BitLanes) {
+    default:
+      return false;
+    case 1:
+      return reduceChannel(Piece.Offset, MI,
+                           SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM
+                                           : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR),
+                           MRI, SIRI, SIII, SlotIndexes);
+    case 2:
+      return reduceChannel(Piece.Offset, MI,
+                           SIII->get(IsImm
+                                         ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
+                                         : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR),
+                           MRI, SIRI, SIII, SlotIndexes);
+    case 3:
+      if (FullMask == 0xff)
+        return false;
+      LLVM_FALLTHROUGH;
+    case 4:
+      return reduceChannel(Piece.Offset, MI,
+                           SIII->get(IsImm
+                                         ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
+                                         : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR),
+                           MRI, SIRI, SIII, SlotIndexes);
+    case 5:
+    case 6:
+    case 7:
+      if (FullMask == 0xffff)
+        return false;
+      LLVM_FALLTHROUGH;
+    case 8:
+      return reduceChannel(Piece.Offset, MI,
+                           SIII->get(IsImm
+                                         ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM
+                                         : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR),
+                           MRI, SIRI, SIII, SlotIndexes);
+    }
+
+  } break;
+  }
+  return false;
+}
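+
+// Illustrative example (mirrors the added MIR tests): an
+// S_BUFFER_LOAD_DWORDX16_IMM whose users only read the first four 32-bit lanes
+// is rewritten to S_BUFFER_LOAD_DWORDX4_IMM and the destination register class
+// is narrowed to match.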
+
 unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
                     const llvm::MachineRegisterInfo &MRI,
                     const llvm::SIRegisterInfo *SIRI) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index c4452c91a43a8..6b9079e5d65fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -24,6 +24,7 @@ class LiveInterval;
 class SlotIndexes;
 class MachineRegisterInfo;
 class SIRegisterInfo;
+class SIInstrInfo;
 class MachineDominatorTree;
 class MachinePostDominatorTree;
 
@@ -45,6 +46,45 @@ bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
 using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
 void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI);
 
+bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
+                 llvm::MachineBasicBlock::iterator MI);
+
+// An enum used to pass additional constraints to
+// `FindOrCreateInsertionPointForSccDef()`. This will further
+// constrain the location where the scc def can be inserted.
+enum SccDefInsertPointConstraintFlags {
+  None = 0,        // No additional constraints.
+  NoExecWrite = 1, // Should be no modification of exec between BeforeInst and
+                   // insert point.
+};
+
+// Look for a safe place to insert an instruction that defines scc.
+//
+//
+// This function is useful for when we need to insert a new
+// instruction that defines scc in a block and we need to find
+// a location that will not smash the existing value.
+//
+// Starting at `BeforeInst` it will look backwards to try to find
+// a place in the block where scc is dead so we can insert our new
+// def there. If no location can be found it will save and restore
+// scc around BeforeInst. This way BeforeInst can safely be used
+// as the new insert location.
+//
+llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
+    llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst,
+    const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII,
+    llvm::MachineRegisterInfo *MRI,
+    SccDefInsertPointConstraintFlags Constraints =
+        SccDefInsertPointConstraintFlags::None);
+
+// For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only
+// used 4 lanes.
+bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
+                       const llvm::SIRegisterInfo *TRI,
+                       const llvm::SIInstrInfo *TII,
+                       llvm::SlotIndexes *SlotIndexes);
+
 unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
   const llvm::MachineRegisterInfo &MRI,
   const llvm::SIRegisterInfo *SIRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index 32301130606a7..c2dbf1a8b297e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -12,7 +12,158 @@
 //
 //==------------------------------------------------------------------------==//
 
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+#include <cmath>
+
 namespace llvm {
+
+void SchedScore::sum(const SchedScore &S, unsigned LoopDepth) {
+  unsigned LoopCount = LoopDepth > 0 ? std::pow(3, LoopDepth) : 1;
+  LatencyHide += LoopCount * S.LatencyHide;
+  MemLatency += LoopCount * S.MemLatency;
+  MixAlu += LoopCount * S.MixAlu;
+  Alu += LoopCount * S.Alu;
+  Lds += LoopCount * S.Lds;
+  SgprSpill |= S.SgprSpill;
+}
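+// Illustrative weighting: a block at loop depth 2 is assumed to execute 3^2 =
+// 9 times, so its per-block scores are added nine times into the totals.
+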
+// Does more occupancy give more performance?
+bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
+  unsigned Gain = latencyGain(TargetOccupancy, ExtraOcc);
+  // 10% is good enough.
+  if ((10 * Gain) >= Alu)
+    return true;
+  return false;
+}
+
+unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const {
+  unsigned Latency = MemLatency;
+  return (Latency / (TgtOcc)) - (Latency / (TgtOcc + ExtraOcc));
+}
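+// Illustrative numbers (hypothetical): with MemLatency = 1200, TgtOcc = 4 and
+// ExtraOcc = 1 the gain is 1200/4 - 1200/5 = 300 - 240 = 60 cycles, and
+// isMemBound() then reports memory bound when 10 * 60 >= Alu.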
+
+// AMDGPULatencyTracker
+AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST)
+    : SIII(ST.getInstrInfo()), ItineraryData(ST.getInstrItineraryData()) {}
+
+void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
+  if (MI.isDebugInstr())
+    return;
+  int Latency = SIII->getInstrLatency(ItineraryData, MI);
+  // If inside latency hide.
+  if (!LatencyMIs.empty()) {
+    bool IsWaitCnt = false;
+    for (auto &MO : MI.operands()) {
+      if (MO.isReg()) {
+        Register Reg = MO.getReg();
+        auto It = LatencyMIs.find(Reg);
+        if (It != LatencyMIs.end()) {
+          IsWaitCnt = true;
+          // If MI use mem result, update latency to mem latency.
+          int Cycle = It->second;
+          if (Cycle > Latency)
+            Latency = Cycle;
+        }
+      }
+    }
+    // Update latency for each mem latency inst.
+    for (auto It = LatencyMIs.begin(); It != LatencyMIs.end();) {
+      auto L = It++;
+      int Cycle = L->second;
+      if (Cycle <= Latency) {
+        // Only Cycle cycles are left; remove the reg.
+        LatencyMIs.erase(L);
+        if (IsWaitCnt && Cycle == Latency) {
+          Score.MemLatency += Cycle;
+          // Only count MemLatency once; the rest is hidden.
+          IsWaitCnt = false;
+        } else {
+          // Hide cycle or count mem latency?
+          Score.LatencyHide += Cycle;
+        }
+      } else {
+        L->second -= Latency;
+        // Hide latency.
+        Score.LatencyHide += Latency;
+      }
+    }
+
+  } else {
+    // TODO: check branch/lds?
+    // TODO: check prevVAlu?
+    auto GetAluStatus = [](const MachineInstr &MI,
+                           const llvm::SIInstrInfo *SIII) {
+      AluStatus Status = AluStatus::Nothing;
+      if (SIII->isVALU(MI.getOpcode())) {
+        Status = AluStatus::Vector;
+      } else if (SIII->isSALU(MI.getOpcode())) {
+        Status = AluStatus::Scalar;
+      }
+      return Status;
+    };
+    AluStatus Status = GetAluStatus(MI, SIII);
+
+    switch (PrevStatus) {
+    case AluStatus::Nothing: {
+      Score.Alu += Latency;
+      Score.MixAlu += Latency;
+      PrevStatus = Status;
+    } break;
+    case AluStatus::Vector:
+    case AluStatus::Scalar: {
+      Score.Alu += Latency;
+      // Ignore mix alu.
+      if (PrevStatus != Status) {
+        PrevStatus = AluStatus::Nothing;
+      } else {
+        Score.MixAlu += Latency;
+      }
+    } break;
+    }
+  }
+  // Update latency inst.
+  if (SIII->isHighLatencyDef(MI.getOpcode()) && MI.mayLoad()) {
+    Register Reg = MI.getOperand(0).getReg();
+    // TODO: get correct latency.
+    // SIII->getInstrLatency(ItineraryData, MI);
+    constexpr unsigned kHighLatency = 180;
+    LatencyMIs[Reg] = kHighLatency;
+  } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) {
+    Register Reg = MI.getOperand(0).getReg();
+    // TODO: get correct latency.
+    // SIII->getInstrLatency(ItineraryData, MI);
+    constexpr unsigned kLowLatency = 35;
+    LatencyMIs[Reg] = kLowLatency;
+  }
 }
 
 
+SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
+                          const llvm::MachineLoopInfo *MLI) {
+  SchedScore TotalScore;
+  for (MachineBasicBlock &MBB : MF) {
+    AMDGPULatencyTracker LatencyTracker(ST);
+    for (auto &MI : MBB) {
+      LatencyTracker.scan(MI);
+    }
+    unsigned LoopDepth = 0;
+    if (MLI) {
+      LoopDepth = MLI->getLoopDepth(&MBB);
+    }
+    TotalScore.sum(LatencyTracker.Score, LoopDepth);
+  }
+  return TotalScore;
+}
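+
+// Illustrative use (hypothetical caller): a pass could query
+//   SchedScore Score = collectLatency(MF, ST, MLI);
+//   bool MemBound = Score.isMemBound(TargetOcc, /*ExtraOcc=*/1);
+// to decide whether trading registers for occupancy is likely to pay off.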
+
+} // namespace llvm
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index f9be0a2c73d86..b513e7335ffe4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -15,11 +15,16 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
 
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/ADT/DenseMap.h"
+
 namespace llvm {
 
+class MachineInstr;
 class MachineFunction;
 class GCNSubtarget;
 class MachineLoopInfo;
+class SIInstrInfo;
 
 struct SchedScore {
   // Score for this Sched result.
@@ -45,6 +50,28 @@ struct SchedScore {
   unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
 };
 
+struct AMDGPULatencyTracker {
+  AMDGPULatencyTracker(const llvm::GCNSubtarget &ST);
+  const llvm::SIInstrInfo *SIII;
+  const llvm::InstrItineraryData *ItineraryData;
+  // Latency MI dst reg to cycle map.
+  llvm::DenseMap<unsigned, int> LatencyMIs;
+  SchedScore Score;
+  // Low latency MIs we have not waited on yet.
+  unsigned HideLatency = 0;
+  unsigned MemLatency = 0;
+  // For simplicity, only consider the mix as one VALU and one SALU.
+  // No grouping for now.
+  unsigned PrevSAlu = 0;
+  unsigned PrevVAlu = 0;
+  enum class AluStatus {
+    Nothing,
+    Vector,
+    Scalar,
+  } PrevStatus = AluStatus::Nothing;
+  void scan(const llvm::MachineInstr &MI);
+};
+
 SchedScore collectLatency(llvm::MachineFunction &MF,
                           const llvm::GCNSubtarget &ST,
                           const llvm::MachineLoopInfo *MLI = nullptr);

>From 3c2b1f3acd43503c7f90781784687cd473af09fc Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Mon, 21 Apr 2025 15:29:26 -0700
Subject: [PATCH 03/11] Tests

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    |  13 +-
 .../CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir | 565 ++++++++++++++++++
 .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 452 ++++++++++++++
 3 files changed, 1029 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 95237062a6093..5c628a89766c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -31,6 +31,8 @@
 
 using namespace llvm;
 
+static cl::opt<bool>
+    EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
 static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
 
 namespace {
@@ -723,6 +725,12 @@ int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI,
     if (IsSingleDef) {
+      // The reg might be shared with other candidates; check it here.
       // Count share reg in getReducedSize.
+      if (EnableAggressive) {
+        // In case of aggressive remat, treat a multi-use reg as a shared reg
+        // and ignore the size of the shared reg.
+        if (!MRI.hasOneNonDBGUse(Reg))
+          continue;
+      }
       const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
       if (unsigned SubIdx = MO.getSubReg()) {
         if (OpRC)
@@ -1253,6 +1261,9 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop
   unsigned SLimit = Status.TargetSLimit;
 
   int RematSCnt = Status.MaxSPressure - SLimit;
+  // When doing aggressive SGPR remat, reserve some headroom for registers
+  // lost to allocation.
+  if (EnableAggressive)
+    RematSCnt += NearTargetRegLimit;
 
   bool IsSGPRSpill = false;
   if (RematSCnt > 0) {
@@ -1367,7 +1378,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop
         for (RematNode &Node : SRematList) {
           SRematMap[Node.Reg] = Node;
           RematCnt += Node.Size;
-          if (RematCnt > RematSCnt)
+          if (RematCnt > RematSCnt && !EnableAggressive)
             break;
         }
         NewRematSCnt = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
new file mode 100644
index 0000000000000..02a9836313360
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
@@ -0,0 +1,565 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
+
+# Check that the buffer loads have been moved to the use and the lanes are reduced
+# correctly.
+#
+# CHECK: bb.2:
+#==========================================================================
+# X4_IMM, Using .x
+# CHECK: %[[#reg0:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 0, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 0, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 4, 0
+# X4_IMM, Using .xy
+# CHECK: %[[#reg1:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 16, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub0, %{{.+}}, 16, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub1, %{{.+}}, 20, 0
+# X4_IMM, Using .xyz
+# CHECK: %[[#reg2:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 32, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub0, %{{.+}}, 32, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub1, %{{.+}}, 36, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub2, %{{.+}}, 40, 0
+# X4_IMM, Using .yz
+# CHECK: %[[#reg3:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 48, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub1, %{{.+}}, 48, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub2, %{{.+}}, 52, 0
+# X4_IMM, Using .yzw
+# CHECK: %[[#reg4:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 64, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub1, %{{.+}}, 64, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub2, %{{.+}}, 68, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub3, %{{.+}}, 72, 0
+#==========================================================================
+# X8_IMM, Using .x
+# CHECK: %[[#reg5:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 80, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 80, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 84, 0
+# X8_IMM, Using .xy
+# CHECK: %[[#reg6:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 96, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub0, %{{.+}}, 96, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub1, %{{.+}}, 100, 0
+# X8_IMM, Using .xyz
+# CHECK: %[[#reg7:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 112, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub0, %{{.+}}, 112, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub1, %{{.+}}, 116, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub2, %{{.+}}, 120, 0
+# X8_IMM, Using .xyzw
+# CHECK: %[[#reg8:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 128, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub0, %{{.+}}, 128, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub1, %{{.+}}, 132, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub2, %{{.+}}, 136, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub3, %{{.+}}, 140, 0
+# X8_IMM, Using .xyzw + 5th dword
+# CHECK: %[[#reg9:]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %{{.+}}, 144, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub0, %{{.+}}, 144, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub1, %{{.+}}, 148, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub2, %{{.+}}, 152, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub3, %{{.+}}, 156, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub4, %{{.+}}, 160, 0
+#==========================================================================
+# X16_IMM, Using .xy and .zw
+# CHECK: %[[#reg10:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 160, 0
+# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub0_sub1, %{{.+}}, 160, 0
+# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub2_sub3, %{{.+}}, 164, 0
+#==========================================================================
+# X4_SGPR, Using .x
+# CHECK: %[[#reg11:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %{{.+}}, %{{.+}}, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 176, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 180, 0
+# X8_SGPR, Using .xy
+# CHECK: %[[#reg12:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR %{{.+}}, %{{.+}}, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub0, %{{.+}}, 192, 0
+# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub1, %{{.+}}, 196, 0
+# X16_SGPR, Using .xy + .zw
+# CHECK: %[[#reg13:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %{{.+}}, %{{.+}}, 0
+# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub0_sub1, %{{.+}}, 208, 0
+# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub2_sub3, %{{.+}}, 216, 0
+#==========================================================================
+#
+#
+# CHECK: %[[#reg14:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 224, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0
+# CHECK: %[[#reg15:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 240, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0
+# CHECK: %[[#reg16:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 256, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0
+# CHECK: %[[#reg17:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 272, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0
+# CHECK: %[[#reg18:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 288, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0
+# CHECK: %[[#reg19:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 304, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0
+# CHECK: %[[#reg20:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 320, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0
+# CHECK: %[[#reg21:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 336, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0
+# CHECK: %[[#reg22:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 352, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0
+# CHECK: %[[#reg23:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 368, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0
+# CHECK: %[[#reg24:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 384, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0
+# CHECK: %[[#reg25:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 400, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0
+# CHECK: %[[#reg26:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 416, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0
+# CHECK: %[[#reg27:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 432, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0
+# CHECK: %[[#reg28:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 448, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0
+# CHECK: %[[#reg29:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 464, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0
+# CHECK: %[[#reg30:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 480, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0
+# CHECK: %[[#reg31:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 496, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0
+# CHECK: %[[#reg32:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 512, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0
+# CHECK: %[[#reg33:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 528, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0
+# CHECK: %[[#reg34:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 544, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0
+# CHECK: %[[#reg35:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 560, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0
+# CHECK: %[[#reg36:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 576, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0
+# CHECK: %[[#reg37:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 592, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0
+# CHECK: %[[#reg38:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 608, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0
+# CHECK: %[[#reg39:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 624, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0
+# CHECK: %[[#reg40:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 640, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0
+# CHECK: %[[#reg41:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 656, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0
+# CHECK: %[[#reg42:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 672, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0
+# CHECK: %[[#reg43:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 688, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0
+# CHECK: %[[#reg44:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 704, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0
+# CHECK: %[[#reg45:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 720, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0
+# CHECK: %[[#reg46:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 736, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0
+# CHECK: %[[#reg47:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 752, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0
+# CHECK: %[[#reg48:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 768, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0
+# CHECK: %[[#reg49:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 784, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0
+# CHECK: %[[#reg50:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 800, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0
+# CHECK: %[[#reg51:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 816, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0
+# CHECK: %[[#reg52:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 832, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0
+# CHECK: %[[#reg53:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 848, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0
+# CHECK: %[[#reg54:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 864, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0
+# CHECK: %[[#reg55:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 880, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0
+# CHECK: %[[#reg56:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 896, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0
+# CHECK: %[[#reg57:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 912, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0
+# CHECK: %[[#reg58:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 928, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0
+# CHECK: %[[#reg59:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 944, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0
+# CHECK: %[[#reg60:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 960, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0
+# CHECK: %[[#reg61:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 976, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0
+# CHECK: %[[#reg62:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 992, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0
+# CHECK: %[[#reg63:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0
+
+
+--- |
+  source_filename = ".\main.ll"
+  define amdgpu_ps void @main() #1 {
+    ret void
+  }
+  attributes #1 = { "target-cpu"="gfx1010" }
+  !llvm.ident = !{!0}
+  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name:            main
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+  - { reg: '$sgpr2' }
+  - { reg: '$sgpr3' }
+  - { reg: '$sgpr4' }
+  - { reg: '$sgpr5' }
+  - { reg: '$sgpr6' }
+  - { reg: '$sgpr7' }
+  - { reg: '$sgpr8' }
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1
+
+    %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
+    %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
+    %2:sgpr_128 = REG_SEQUENCE $sgpr8, %subreg.sub0, $sgpr9, %subreg.sub1, $sgpr10, %subreg.sub2, $sgpr11, %subreg.sub3
+
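+    ; All of the S_BUFFER_LOAD results below are defined here in bb.0 but are
+    ; only consumed by the stores in bb.2, which makes them candidates for
+    ; rematerialization into bb.2.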
+    ; X4_IMM
+    %3000:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 0, 0
+    %3001:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 16, 0
+    %3002:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 32, 0
+    %3003:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 48, 0
+    %3004:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 64, 0
+
+    ; X8_IMM
+    %3005:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 80, 0
+    %3006:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 96, 0
+    %3007:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 112, 0
+    %3008:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 128, 0
+    %3009:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 144, 0
+
+    ; X16_IMM
+    %30010:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %2:sgpr_128, 160, 0
+
+    ; X4_SGPR
+    %50:sgpr_32 = COPY $sgpr0
+    %30011:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %2:sgpr_128, %50, 0
+
+    ; X8_SGPR
+    %51:sgpr_32 = COPY $sgpr1
+    %30012:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR %2:sgpr_128, %51, 0
+
+    ; X16_SGPR
+    %52:sgpr_32 = COPY $sgpr2
+    %30013:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR %2:sgpr_128, %52, 0
+
+    %30014:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 224, 0
+    %30015:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 240, 0
+    %30016:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 256, 0
+    %30017:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 272, 0
+    %30018:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 288, 0
+    %30019:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 304, 0
+    %30020:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 320, 0
+    %30021:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 336, 0
+    %30022:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 352, 0
+    %30023:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 368, 0
+    %30024:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 384, 0
+    %30025:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 400, 0
+    %30026:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 416, 0
+    %30027:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 432, 0
+    %30028:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 448, 0
+    %30029:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 464, 0
+    %30030:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 480, 0
+    %30031:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 496, 0
+    %30032:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 512, 0
+    %30033:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 528, 0
+    %30034:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 544, 0
+    %30035:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 560, 0
+    %30036:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 576, 0
+    %30037:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 592, 0
+    %30038:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 608, 0
+    %30039:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 624, 0
+    %30040:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 640, 0
+    %30041:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 656, 0
+    %30042:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 672, 0
+    %30043:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 688, 0
+    %30044:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 704, 0
+    %30045:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 720, 0
+    %30046:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 736, 0
+    %30047:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 752, 0
+    %30048:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 768, 0
+    %30049:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 784, 0
+    %30050:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 800, 0
+    %30051:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 816, 0
+    %30052:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 832, 0
+    %30053:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 848, 0
+    %30054:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 864, 0
+    %30055:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 880, 0
+    %30056:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 896, 0
+    %30057:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 912, 0
+    %30058:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 928, 0
+    %30059:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 944, 0
+    %30060:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 960, 0
+    %30061:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 976, 0
+    %30062:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 992, 0
+    %30063:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 1008, 0
+
+    %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+
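+    ; bb.0 jumps straight to bb.2 when the new exec mask is zero and otherwise
+    ; goes through bb.1; either way, the loaded values are first used after
+    ; the join in bb.2.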
+    %8000:vgpr_32 = IMPLICIT_DEF
+    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:  
+    successors: %bb.2
+    %8001:vgpr_32 = COPY %8000
+    S_BRANCH %bb.2
+
+  bb.2:
+
+    %3:vgpr_32 = IMPLICIT_DEF
+    ;==========================================================================
+    ; X4_IMM, Using .x
+    S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 0, 0
+    S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 4, 0 ; Store it a second time: the lane reduction triggers on clone, and cloning only happens when there are multiple uses.
+
+    ; X4_IMM, Using .xy
+    S_BUFFER_STORE_DWORD_IMM %3001.sub0, %1:sgpr_128, 16, 0
+    S_BUFFER_STORE_DWORD_IMM %3001.sub1, %1:sgpr_128, 20, 0
+
+    ; X4_IMM, Using .xyz
+    S_BUFFER_STORE_DWORD_IMM %3002.sub0, %1:sgpr_128, 32, 0
+    S_BUFFER_STORE_DWORD_IMM %3002.sub1, %1:sgpr_128, 36, 0
+    S_BUFFER_STORE_DWORD_IMM %3002.sub2, %1:sgpr_128, 40, 0
+
+    ; X4_IMM, Using .yz
+    S_BUFFER_STORE_DWORD_IMM %3003.sub1, %1:sgpr_128, 48, 0
+    S_BUFFER_STORE_DWORD_IMM %3003.sub2, %1:sgpr_128, 52, 0
+
+    ; X4_IMM, Using .yzw
+    S_BUFFER_STORE_DWORD_IMM %3004.sub1, %1:sgpr_128, 64, 0
+    S_BUFFER_STORE_DWORD_IMM %3004.sub2, %1:sgpr_128, 68, 0
+    S_BUFFER_STORE_DWORD_IMM %3004.sub3, %1:sgpr_128, 72, 0
+
+    ;==========================================================================
+    ; X8_IMM, Using .x
+    S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 80, 0
+    S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 84, 0
+    
+    ; X8_IMM, Using .xy
+    S_BUFFER_STORE_DWORD_IMM %3006.sub0, %1:sgpr_128, 96, 0
+    S_BUFFER_STORE_DWORD_IMM %3006.sub1, %1:sgpr_128, 100, 0
+
+    ; X8_IMM, Using .xyz
+    S_BUFFER_STORE_DWORD_IMM %3007.sub0, %1:sgpr_128, 112, 0
+    S_BUFFER_STORE_DWORD_IMM %3007.sub1, %1:sgpr_128, 116, 0
+    S_BUFFER_STORE_DWORD_IMM %3007.sub2, %1:sgpr_128, 120, 0
+
+    ; X8_IMM, Using .xyzw
+    S_BUFFER_STORE_DWORD_IMM %3008.sub0, %1:sgpr_128, 128, 0
+    S_BUFFER_STORE_DWORD_IMM %3008.sub1, %1:sgpr_128, 132, 0
+    S_BUFFER_STORE_DWORD_IMM %3008.sub2, %1:sgpr_128, 136, 0
+    S_BUFFER_STORE_DWORD_IMM %3008.sub3, %1:sgpr_128, 140, 0
+    
+    ; X8_IMM, Using .xyzw + 5th dword
+    S_BUFFER_STORE_DWORD_IMM %3009.sub0, %1:sgpr_128, 144, 0
+    S_BUFFER_STORE_DWORD_IMM %3009.sub1, %1:sgpr_128, 148, 0
+    S_BUFFER_STORE_DWORD_IMM %3009.sub2, %1:sgpr_128, 152, 0
+    S_BUFFER_STORE_DWORD_IMM %3009.sub3, %1:sgpr_128, 156, 0
+    S_BUFFER_STORE_DWORD_IMM %3009.sub4, %1:sgpr_128, 160, 0
+
+    ;==========================================================================
+    ; X16_IMM, Using .xy and .zw
+    S_BUFFER_STORE_DWORDX2_IMM %30010.sub0_sub1, %1:sgpr_128, 160, 0
+    S_BUFFER_STORE_DWORDX2_IMM %30010.sub2_sub3, %1:sgpr_128, 164, 0
+
+    ;==========================================================================
+    ; X4_SGPR, Using .x
+    S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 176, 0
+    S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 180, 0
+
+    ; X8_SGPR, Using .xy
+    S_BUFFER_STORE_DWORD_IMM %30012.sub0, %1:sgpr_128, 192, 0
+    S_BUFFER_STORE_DWORD_IMM %30012.sub1, %1:sgpr_128, 196, 0
+
+    ; X16_SGPR, Using .xy + .zw
+    S_BUFFER_STORE_DWORDX2_IMM %30013.sub0_sub1, %1:sgpr_128, 208, 0
+    S_BUFFER_STORE_DWORDX2_IMM %30013.sub2_sub3, %1:sgpr_128, 216, 0
+
+    ;==========================================================================
+    S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0
+
+    EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+    S_ENDPGM 0
+...
+
+
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
new file mode 100644
index 0000000000000..69875261b74e9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
@@ -0,0 +1,452 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
+
+# Check that the loads have been moved down to their uses in bb.2
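+# Each rematerialized S_LOAD_DWORDX4_IMM is expected to reappear in bb.2
+# immediately before the S_BUFFER_STORE_DWORDX4_IMM that consumes it, at the
+# matching offset.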
+# CHECK: bb.2:
+# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0
+# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0
+# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0
+# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0
+# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0
+# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0
+# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0
+# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0
+# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0
+# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0
+# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0
+# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0
+# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0
+# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0
+# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0
+# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0
+# CHECK: %[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0
+# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0
+# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0
+# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0
+# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0
+# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0
+# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0
+# CHECK: %[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0
+# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0
+# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0
+# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0
+# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0
+# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0
+# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0
+# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0
+# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0
+# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0
+# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0
+# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0
+# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0
+# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0
+# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0
+# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0
+# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0
+# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0
+# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0
+# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 672, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0
+# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0
+# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0
+# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0
+# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0
+# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0
+# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0
+# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 784, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0
+# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0
+# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0
+# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0
+# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0
+# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0
+# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0
+# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0
+# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0
+# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0
+# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0
+# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0
+# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0
+# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0
+# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0
+
+
+--- |
+  source_filename = ".\main.ll"
+  define amdgpu_ps void @main() #1 {
+    ret void
+  }
+  attributes #1 = { "target-cpu"="gfx1010" }
+  !llvm.ident = !{!0}
+  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name:            main
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+  - { reg: '$sgpr2' }
+  - { reg: '$sgpr3' }
+  - { reg: '$sgpr4' }
+  - { reg: '$sgpr5' }
+  - { reg: '$sgpr6' }
+  - { reg: '$sgpr7' }
+  - { reg: '$sgpr8' }
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+    %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
+    ; undef %0.sub0:sgpr_64 = COPY $sgpr0
+    ; undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+    %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
+    ; undef %1.sub0:sgpr_128 = COPY $sgpr4
+    ; undef %1.sub1:sgpr_128 = COPY $sgpr5
+    ; undef %1.sub2:sgpr_128 = COPY $sgpr6
+    ; undef %1.sub3:sgpr_128 = COPY $sgpr7
+
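+    ; As in the test above, every S_LOAD_DWORDX4_IMM result is only used by a
+    ; store in bb.2, so the pass is expected to sink each load next to its use.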
+    %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0
+    %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0
+    %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0
+    %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0
+    %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0
+    %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0
+    %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0
+    %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0
+    %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0
+    %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0
+    %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0
+    %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0
+    %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0
+    %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0
+    %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0
+    %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0
+    %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0
+    %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0
+    %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0
+    %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0
+    %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0
+    %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0
+    %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0
+    %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0
+    %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0
+    %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0
+    %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0
+    %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0
+    %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0
+    %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0
+    %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0
+    %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0
+    %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0
+    %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0
+    %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0
+    %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0
+    %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0
+    %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0
+    %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0
+    %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0
+    %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0
+    %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0
+    %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0
+    %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0
+    %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0
+    %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0
+    %30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0
+    %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0
+    %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0
+    %30049:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0
+    %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0
+    %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0
+    %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0
+    %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0
+    %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0
+    %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0
+    %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0
+    %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0
+    %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0
+    %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0
+    %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0
+    %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0
+    %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0
+    %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0
+
+    %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+
+    %8000:vgpr_32 = IMPLICIT_DEF
+    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:  
+    successors: %bb.2
+    %8001:vgpr_32 = COPY %8000
+    S_BRANCH %bb.2
+
+  bb.2:
+
+    %3:vgpr_32 = IMPLICIT_DEF
+    S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0
+
+    EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+    S_ENDPGM 0
+...

>From dbdc9a48b78f7cc97f25d7e0195d1e5423d69265 Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Mon, 21 Apr 2025 15:59:28 -0700
Subject: [PATCH 04/11] Added test for the phi crash in pressure tracker

---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp |  18 +-
 llvm/test/CodeGen/AMDGPU/remat/phi.mir    | 607 ++++++++++++++++++++++
 2 files changed, 618 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index f74d12cfab0c0..7f76d14eb9ab0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -549,22 +549,26 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
         if (!S.liveAt(SI)) {
           if (It == LiveRegs.end()) {
             It = LiveRegs.find(MO.getReg());
-            if (It == LiveRegs.end())
+            if (!MRI->isSSA() && It == LiveRegs.end())
               llvm_unreachable("register isn't live");
           }
-          auto PrevMask = It->second;
-          It->second &= ~S.LaneMask;
-          CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI);
+          if (It != LiveRegs.end()) {
+            auto PrevMask = It->second;
+            It->second &= ~S.LaneMask;
+            CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI);
+          }
         }
       }
       if (It != LiveRegs.end() && It->second.none())
         LiveRegs.erase(It);
     } else if (!LI.liveAt(SI)) {
       auto It = LiveRegs.find(MO.getReg());
-      if (It == LiveRegs.end())
+      if (!MRI->isSSA() && It == LiveRegs.end())
         llvm_unreachable("register isn't live");
-      CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI);
-      LiveRegs.erase(It);
+      if (It != LiveRegs.end()) {
+        CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI);
+        LiveRegs.erase(It);
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi.mir b/llvm/test/CodeGen/AMDGPU/remat/phi.mir
new file mode 100644
index 0000000000000..2d22e9fba2593
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/phi.mir
@@ -0,0 +1,607 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -amdgpu-remat-enable-hot-block-remat-aggressive -run-pass=amdgpu-hot-block-remat -o - | FileCheck %s
+
+# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are
+# present.
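+# The only CHECK is for S_ENDPGM; what matters is that the pass (and the
+# register pressure tracker it drives) completes without hitting the
+# "register isn't live" assertion relaxed for SSA in GCNRegPressure.cpp above.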
+
+# CHECK: S_ENDPGM
+
+--- |
+  source_filename = ".\main.ll"
+  define amdgpu_ps void @main() #1 {
+    ret void
+  }
+  attributes #1 = { "target-cpu"="gfx1010" }
+  !llvm.ident = !{!0}
+  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name:            main
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+  - { reg: '$sgpr2' }
+  - { reg: '$sgpr3' }
+  - { reg: '$sgpr4' }
+  - { reg: '$sgpr5' }
+  - { reg: '$sgpr6' }
+  - { reg: '$sgpr7' }
+  - { reg: '$sgpr8' }
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+    %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
+    ; undef %0.sub0:sgpr_64 = COPY $sgpr0
+    ; undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+    %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
+    ; undef %1.sub0:sgpr_128 = COPY $sgpr4
+    ; undef %1.sub1:sgpr_128 = COPY $sgpr5
+    ; undef %1.sub2:sgpr_128 = COPY $sgpr6
+    ; undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+
+    %2000:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2001:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2002:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2003:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2004:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2005:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2006:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2007:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2008:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2009:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2010:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2011:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2012:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2013:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2014:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2015:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2016:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2017:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2018:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2019:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2020:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2021:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2022:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2023:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2024:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2025:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2026:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2027:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2028:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2029:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2030:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2031:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2032:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2033:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2034:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2035:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2036:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2037:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2038:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2039:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2040:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2041:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2042:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2043:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2044:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2045:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2046:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2047:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2048:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2049:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2050:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2051:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2052:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2053:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2054:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2055:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2056:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2057:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2058:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2059:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2060:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2061:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2062:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2063:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2064:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2065:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2066:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2067:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2068:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2069:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2070:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2071:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2072:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2073:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2074:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2075:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2076:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2077:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2078:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2079:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2080:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2081:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2082:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2083:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2084:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2085:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2086:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2087:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2088:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2089:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2090:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2091:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2092:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2093:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2094:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2095:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2096:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2097:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2098:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %2099:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %3000:sgpr_32 = S_MOV_B32 0
+    %3001:sgpr_32 = S_MOV_B32 1
+    %3002:sgpr_32 = S_MOV_B32 2
+    %3003:sgpr_32 = S_MOV_B32 3
+    %3004:sgpr_32 = S_MOV_B32 4
+    %3005:sgpr_32 = S_MOV_B32 5
+    %3006:sgpr_32 = S_MOV_B32 6
+    %3007:sgpr_32 = S_MOV_B32 7
+    %3008:sgpr_32 = S_MOV_B32 8
+    %3009:sgpr_32 = S_MOV_B32 9
+    %3010:sgpr_32 = S_MOV_B32 10
+    %3011:sgpr_32 = S_MOV_B32 11
+    %3012:sgpr_32 = S_MOV_B32 12
+    %3013:sgpr_32 = S_MOV_B32 13
+    %3014:sgpr_32 = S_MOV_B32 14
+    %3015:sgpr_32 = S_MOV_B32 15
+    %3016:sgpr_32 = S_MOV_B32 16
+    %3017:sgpr_32 = S_MOV_B32 17
+    %3018:sgpr_32 = S_MOV_B32 18
+    %3019:sgpr_32 = S_MOV_B32 19
+    %3020:sgpr_32 = S_MOV_B32 20
+    %3021:sgpr_32 = S_MOV_B32 21
+    %3022:sgpr_32 = S_MOV_B32 22
+    %3023:sgpr_32 = S_MOV_B32 23
+    %3024:sgpr_32 = S_MOV_B32 24
+    %3025:sgpr_32 = S_MOV_B32 25
+    %3026:sgpr_32 = S_MOV_B32 26
+    %3027:sgpr_32 = S_MOV_B32 27
+    %3028:sgpr_32 = S_MOV_B32 28
+    %3029:sgpr_32 = S_MOV_B32 29
+    %3030:sgpr_32 = S_MOV_B32 30
+    %3031:sgpr_32 = S_MOV_B32 31
+    %3032:sgpr_32 = S_MOV_B32 32
+    %3033:sgpr_32 = S_MOV_B32 33
+    %3034:sgpr_32 = S_MOV_B32 34
+    %3035:sgpr_32 = S_MOV_B32 35
+    %3036:sgpr_32 = S_MOV_B32 36
+    %3037:sgpr_32 = S_MOV_B32 37
+    %3038:sgpr_32 = S_MOV_B32 38
+    %3039:sgpr_32 = S_MOV_B32 39
+    %3040:sgpr_32 = S_MOV_B32 40
+    %3041:sgpr_32 = S_MOV_B32 41
+    %3042:sgpr_32 = S_MOV_B32 42
+    %3043:sgpr_32 = S_MOV_B32 43
+    %3044:sgpr_32 = S_MOV_B32 44
+    %3045:sgpr_32 = S_MOV_B32 45
+    %3046:sgpr_32 = S_MOV_B32 46
+    %3047:sgpr_32 = S_MOV_B32 47
+    %3048:sgpr_32 = S_MOV_B32 48
+    %3049:sgpr_32 = S_MOV_B32 49
+    %3050:sgpr_32 = S_MOV_B32 50
+    %3051:sgpr_32 = S_MOV_B32 51
+    %3052:sgpr_32 = S_MOV_B32 52
+    %3053:sgpr_32 = S_MOV_B32 53
+    %3054:sgpr_32 = S_MOV_B32 54
+    %3055:sgpr_32 = S_MOV_B32 55
+    %3056:sgpr_32 = S_MOV_B32 56
+    %3057:sgpr_32 = S_MOV_B32 57
+    %3058:sgpr_32 = S_MOV_B32 58
+    %3059:sgpr_32 = S_MOV_B32 59
+    %3060:sgpr_32 = S_MOV_B32 60
+    %3061:sgpr_32 = S_MOV_B32 61
+    %3062:sgpr_32 = S_MOV_B32 62
+    %3063:sgpr_32 = S_MOV_B32 63
+    %3064:sgpr_32 = S_MOV_B32 64
+    %3065:sgpr_32 = S_MOV_B32 65
+    %3066:sgpr_32 = S_MOV_B32 66
+    %3067:sgpr_32 = S_MOV_B32 67
+    %3068:sgpr_32 = S_MOV_B32 68
+    %3069:sgpr_32 = S_MOV_B32 69
+    %3070:sgpr_32 = S_MOV_B32 70
+    %3071:sgpr_32 = S_MOV_B32 71
+    %3072:sgpr_32 = S_MOV_B32 72
+    %3073:sgpr_32 = S_MOV_B32 73
+    %3074:sgpr_32 = S_MOV_B32 74
+    %3075:sgpr_32 = S_MOV_B32 75
+    %3076:sgpr_32 = S_MOV_B32 76
+    %3077:sgpr_32 = S_MOV_B32 77
+    %3078:sgpr_32 = S_MOV_B32 78
+    %3079:sgpr_32 = S_MOV_B32 79
+    %3080:sgpr_32 = S_MOV_B32 80
+    %3081:sgpr_32 = S_MOV_B32 81
+    %3082:sgpr_32 = S_MOV_B32 82
+    %3083:sgpr_32 = S_MOV_B32 83
+    %3084:sgpr_32 = S_MOV_B32 84
+    %3085:sgpr_32 = S_MOV_B32 85
+    %3086:sgpr_32 = S_MOV_B32 86
+    %3087:sgpr_32 = S_MOV_B32 87
+    %3088:sgpr_32 = S_MOV_B32 88
+    %3089:sgpr_32 = S_MOV_B32 89
+    %3090:sgpr_32 = S_MOV_B32 90
+    %3091:sgpr_32 = S_MOV_B32 91
+    %3092:sgpr_32 = S_MOV_B32 92
+    %3093:sgpr_32 = S_MOV_B32 93
+    %3094:sgpr_32 = S_MOV_B32 94
+    %3095:sgpr_32 = S_MOV_B32 95
+    %3096:sgpr_32 = S_MOV_B32 96
+    %3097:sgpr_32 = S_MOV_B32 97
+    %3098:sgpr_32 = S_MOV_B32 98
+    %3099:sgpr_32 = S_MOV_B32 99
+
+
+    %8000:vgpr_32 = IMPLICIT_DEF
+    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    %8001:vgpr_32 = COPY %8000
+    %8002:vgpr_32 = COPY %8000
+    %8003:vgpr_32 = COPY %8000
+    %8004:vgpr_32 = COPY %8000
+    %8005:vgpr_32 = COPY %8000
+    %8006:vgpr_32 = COPY %8000
+    %8007:vgpr_32 = COPY %8000
+    %8008:vgpr_32 = COPY %8000
+    %8009:vgpr_32 = COPY %8000
+    %8010:vgpr_32 = COPY %8000
+    %8011:vgpr_32 = COPY %8000
+    %8012:vgpr_32 = COPY %8000
+    %8013:vgpr_32 = COPY %8000
+    %8014:vgpr_32 = COPY %8000
+    %8015:vgpr_32 = COPY %8000
+    %8016:vgpr_32 = COPY %8000
+    %8017:vgpr_32 = COPY %8000
+
+    %9001:vgpr_32 = COPY %8001
+    %9002:vgpr_32 = COPY %8002
+    %9003:vgpr_32 = COPY %8003
+    %9004:vgpr_32 = COPY %8004
+    %9005:vgpr_32 = COPY %8005
+    %9006:vgpr_32 = COPY %8006
+    %9007:vgpr_32 = COPY %8007
+    %9008:vgpr_32 = COPY %8008
+    %9009:vgpr_32 = COPY %8009
+    %9010:vgpr_32 = COPY %8010
+    %9011:vgpr_32 = COPY %8011
+    %9012:vgpr_32 = COPY %8012
+    %9013:vgpr_32 = COPY %8013
+    %9014:vgpr_32 = COPY %8014
+    %9015:vgpr_32 = COPY %8015
+    %9016:vgpr_32 = COPY %8016
+    %9017:vgpr_32 = COPY %8017
+
+    S_BRANCH %bb.2
+
+  bb.2:
+    %5000:sgpr_32 = PHI %3000, %bb.0, %8001, %bb.1
+    %5001:sgpr_32 = PHI %3001, %bb.0, %8001, %bb.1
+    %5002:sgpr_32 = PHI %3002, %bb.0, %8001, %bb.1
+    %5003:sgpr_32 = PHI %3003, %bb.0, %8001, %bb.1
+    %5004:sgpr_32 = PHI %3004, %bb.0, %8001, %bb.1
+    %5005:sgpr_32 = PHI %3005, %bb.0, %8001, %bb.1
+    %5006:sgpr_32 = PHI %3006, %bb.0, %8001, %bb.1
+    %5007:sgpr_32 = PHI %3007, %bb.0, %8001, %bb.1
+    %5008:sgpr_32 = PHI %3008, %bb.0, %8001, %bb.1
+    %5009:sgpr_32 = PHI %3009, %bb.0, %8001, %bb.1
+    %5010:sgpr_32 = PHI %3010, %bb.0, %8001, %bb.1
+    %5011:sgpr_32 = PHI %3011, %bb.0, %8001, %bb.1
+    %5012:sgpr_32 = PHI %3012, %bb.0, %8001, %bb.1
+    %5013:sgpr_32 = PHI %3013, %bb.0, %8001, %bb.1
+    %5014:sgpr_32 = PHI %3014, %bb.0, %8001, %bb.1
+    %5015:sgpr_32 = PHI %3015, %bb.0, %8001, %bb.1
+    %5016:sgpr_32 = PHI %3016, %bb.0, %8001, %bb.1
+    %5017:sgpr_32 = PHI %3017, %bb.0, %8001, %bb.1
+    %5018:sgpr_32 = PHI %3018, %bb.0, %8001, %bb.1
+    %5019:sgpr_32 = PHI %3019, %bb.0, %8001, %bb.1
+    %5020:sgpr_32 = PHI %3020, %bb.0, %8001, %bb.1
+    %5021:sgpr_32 = PHI %3021, %bb.0, %8001, %bb.1
+    %5022:sgpr_32 = PHI %3022, %bb.0, %8001, %bb.1
+    %5023:sgpr_32 = PHI %3023, %bb.0, %8001, %bb.1
+    %5024:sgpr_32 = PHI %3024, %bb.0, %8001, %bb.1
+    %5025:sgpr_32 = PHI %3025, %bb.0, %8001, %bb.1
+    %5026:sgpr_32 = PHI %3026, %bb.0, %8001, %bb.1
+    %5027:sgpr_32 = PHI %3027, %bb.0, %8001, %bb.1
+    %5028:sgpr_32 = PHI %3028, %bb.0, %8001, %bb.1
+    %5029:sgpr_32 = PHI %3029, %bb.0, %8001, %bb.1
+    %5030:sgpr_32 = PHI %3030, %bb.0, %8001, %bb.1
+    %5031:sgpr_32 = PHI %3031, %bb.0, %8001, %bb.1
+    %5032:sgpr_32 = PHI %3032, %bb.0, %8001, %bb.1
+    %5033:sgpr_32 = PHI %3033, %bb.0, %8001, %bb.1
+    %5034:sgpr_32 = PHI %3034, %bb.0, %8001, %bb.1
+    %5035:sgpr_32 = PHI %3035, %bb.0, %8001, %bb.1
+    %5036:sgpr_32 = PHI %3036, %bb.0, %8001, %bb.1
+    %5037:sgpr_32 = PHI %3037, %bb.0, %8001, %bb.1
+    %5038:sgpr_32 = PHI %3038, %bb.0, %8001, %bb.1
+    %5039:sgpr_32 = PHI %3039, %bb.0, %8001, %bb.1
+    %5040:sgpr_32 = PHI %3040, %bb.0, %8001, %bb.1
+    %5041:sgpr_32 = PHI %3041, %bb.0, %8001, %bb.1
+    %5042:sgpr_32 = PHI %3042, %bb.0, %8001, %bb.1
+    %5043:sgpr_32 = PHI %3043, %bb.0, %8001, %bb.1
+    %5044:sgpr_32 = PHI %3044, %bb.0, %8001, %bb.1
+    %5045:sgpr_32 = PHI %3045, %bb.0, %8001, %bb.1
+    %5046:sgpr_32 = PHI %3046, %bb.0, %8001, %bb.1
+    %5047:sgpr_32 = PHI %3047, %bb.0, %8001, %bb.1
+    %5048:sgpr_32 = PHI %3048, %bb.0, %8001, %bb.1
+    %5049:sgpr_32 = PHI %3049, %bb.0, %8001, %bb.1
+    %5050:sgpr_32 = PHI %3050, %bb.0, %8001, %bb.1
+    %5051:sgpr_32 = PHI %3051, %bb.0, %8001, %bb.1
+    %5052:sgpr_32 = PHI %3052, %bb.0, %8001, %bb.1
+    %5053:sgpr_32 = PHI %3053, %bb.0, %8001, %bb.1
+    %5054:sgpr_32 = PHI %3054, %bb.0, %8001, %bb.1
+    %5055:sgpr_32 = PHI %3055, %bb.0, %8001, %bb.1
+    %5056:sgpr_32 = PHI %3056, %bb.0, %8001, %bb.1
+    %5057:sgpr_32 = PHI %3057, %bb.0, %8001, %bb.1
+    %5058:sgpr_32 = PHI %3058, %bb.0, %8001, %bb.1
+    %5059:sgpr_32 = PHI %3059, %bb.0, %8001, %bb.1
+    %5060:sgpr_32 = PHI %3060, %bb.0, %8001, %bb.1
+    %5061:sgpr_32 = PHI %3061, %bb.0, %8001, %bb.1
+    %5062:sgpr_32 = PHI %3062, %bb.0, %8001, %bb.1
+    %5063:sgpr_32 = PHI %3063, %bb.0, %8001, %bb.1
+    %5064:sgpr_32 = PHI %3064, %bb.0, %8001, %bb.1
+    %5065:sgpr_32 = PHI %3065, %bb.0, %8001, %bb.1
+    %5066:sgpr_32 = PHI %3066, %bb.0, %8001, %bb.1
+    %5067:sgpr_32 = PHI %3067, %bb.0, %8001, %bb.1
+    %5068:sgpr_32 = PHI %3068, %bb.0, %8001, %bb.1
+    %5069:sgpr_32 = PHI %3069, %bb.0, %8001, %bb.1
+    %5070:sgpr_32 = PHI %3070, %bb.0, %8001, %bb.1
+    %5071:sgpr_32 = PHI %3071, %bb.0, %8001, %bb.1
+    %5072:sgpr_32 = PHI %3072, %bb.0, %8001, %bb.1
+    %5073:sgpr_32 = PHI %3073, %bb.0, %8001, %bb.1
+    %5074:sgpr_32 = PHI %3074, %bb.0, %8001, %bb.1
+    %5075:sgpr_32 = PHI %3075, %bb.0, %8001, %bb.1
+    %5076:sgpr_32 = PHI %3076, %bb.0, %8001, %bb.1
+    %5077:sgpr_32 = PHI %3077, %bb.0, %8001, %bb.1
+    %5078:sgpr_32 = PHI %3078, %bb.0, %8001, %bb.1
+    %5079:sgpr_32 = PHI %3079, %bb.0, %8001, %bb.1
+    %5080:sgpr_32 = PHI %3080, %bb.0, %8001, %bb.1
+    %5081:sgpr_32 = PHI %3081, %bb.0, %8001, %bb.1
+    %5082:sgpr_32 = PHI %3082, %bb.0, %8001, %bb.1
+    %5083:sgpr_32 = PHI %3083, %bb.0, %8001, %bb.1
+    %5084:sgpr_32 = PHI %3084, %bb.0, %8001, %bb.1
+    %5085:sgpr_32 = PHI %3085, %bb.0, %8001, %bb.1
+    %5086:sgpr_32 = PHI %3086, %bb.0, %8001, %bb.1
+    %5087:sgpr_32 = PHI %3087, %bb.0, %8001, %bb.1
+    %5088:sgpr_32 = PHI %3088, %bb.0, %8001, %bb.1
+    %5089:sgpr_32 = PHI %3089, %bb.0, %8001, %bb.1
+    %5090:sgpr_32 = PHI %3090, %bb.0, %8001, %bb.1
+    %5091:sgpr_32 = PHI %3091, %bb.0, %8001, %bb.1
+    %5092:sgpr_32 = PHI %3092, %bb.0, %8001, %bb.1
+    %5093:sgpr_32 = PHI %3093, %bb.0, %8001, %bb.1
+    %5094:sgpr_32 = PHI %3094, %bb.0, %8001, %bb.1
+    %5095:sgpr_32 = PHI %3095, %bb.0, %8001, %bb.1
+    %5096:sgpr_32 = PHI %3096, %bb.0, %8001, %bb.1
+    %5097:sgpr_32 = PHI %3097, %bb.0, %8001, %bb.1
+    %5098:sgpr_32 = PHI %3098, %bb.0, %8001, %bb.1
+    %5099:sgpr_32 = PHI %3099, %bb.0, %8001, %bb.1
+
+
+    %3:vgpr_32 = IMPLICIT_DEF
+
+    %6000:vgpr_32 = V_MOV_B32_e32 %5000, implicit $exec
+    %6001:vgpr_32 = V_MOV_B32_e32 %5001, implicit $exec
+    %6002:vgpr_32 = V_MOV_B32_e32 %5002, implicit $exec
+    %6003:vgpr_32 = V_MOV_B32_e32 %5003, implicit $exec
+    %6004:vgpr_32 = V_MOV_B32_e32 %5004, implicit $exec
+    %6005:vgpr_32 = V_MOV_B32_e32 %5005, implicit $exec
+    %6006:vgpr_32 = V_MOV_B32_e32 %5006, implicit $exec
+    %6007:vgpr_32 = V_MOV_B32_e32 %5007, implicit $exec
+    %6008:vgpr_32 = V_MOV_B32_e32 %5008, implicit $exec
+    %6009:vgpr_32 = V_MOV_B32_e32 %5009, implicit $exec
+    %6010:vgpr_32 = V_MOV_B32_e32 %5010, implicit $exec
+    %6011:vgpr_32 = V_MOV_B32_e32 %5011, implicit $exec
+    %6012:vgpr_32 = V_MOV_B32_e32 %5012, implicit $exec
+    %6013:vgpr_32 = V_MOV_B32_e32 %5013, implicit $exec
+    %6014:vgpr_32 = V_MOV_B32_e32 %5014, implicit $exec
+    %6015:vgpr_32 = V_MOV_B32_e32 %5015, implicit $exec
+    %6016:vgpr_32 = V_MOV_B32_e32 %5016, implicit $exec
+    %6017:vgpr_32 = V_MOV_B32_e32 %5017, implicit $exec
+    %6018:vgpr_32 = V_MOV_B32_e32 %5018, implicit $exec
+    %6019:vgpr_32 = V_MOV_B32_e32 %5019, implicit $exec
+    %6020:vgpr_32 = V_MOV_B32_e32 %5020, implicit $exec
+    %6021:vgpr_32 = V_MOV_B32_e32 %5021, implicit $exec
+    %6022:vgpr_32 = V_MOV_B32_e32 %5022, implicit $exec
+    %6023:vgpr_32 = V_MOV_B32_e32 %5023, implicit $exec
+    %6024:vgpr_32 = V_MOV_B32_e32 %5024, implicit $exec
+    %6025:vgpr_32 = V_MOV_B32_e32 %5025, implicit $exec
+    %6026:vgpr_32 = V_MOV_B32_e32 %5026, implicit $exec
+    %6027:vgpr_32 = V_MOV_B32_e32 %5027, implicit $exec
+    %6028:vgpr_32 = V_MOV_B32_e32 %5028, implicit $exec
+    %6029:vgpr_32 = V_MOV_B32_e32 %5029, implicit $exec
+    %6030:vgpr_32 = V_MOV_B32_e32 %5030, implicit $exec
+    %6031:vgpr_32 = V_MOV_B32_e32 %5031, implicit $exec
+    %6032:vgpr_32 = V_MOV_B32_e32 %5032, implicit $exec
+    %6033:vgpr_32 = V_MOV_B32_e32 %5033, implicit $exec
+    %6034:vgpr_32 = V_MOV_B32_e32 %5034, implicit $exec
+    %6035:vgpr_32 = V_MOV_B32_e32 %5035, implicit $exec
+    %6036:vgpr_32 = V_MOV_B32_e32 %5036, implicit $exec
+    %6037:vgpr_32 = V_MOV_B32_e32 %5037, implicit $exec
+    %6038:vgpr_32 = V_MOV_B32_e32 %5038, implicit $exec
+    %6039:vgpr_32 = V_MOV_B32_e32 %5039, implicit $exec
+    %6040:vgpr_32 = V_MOV_B32_e32 %5040, implicit $exec
+    %6041:vgpr_32 = V_MOV_B32_e32 %5041, implicit $exec
+    %6042:vgpr_32 = V_MOV_B32_e32 %5042, implicit $exec
+    %6043:vgpr_32 = V_MOV_B32_e32 %5043, implicit $exec
+    %6044:vgpr_32 = V_MOV_B32_e32 %5044, implicit $exec
+    %6045:vgpr_32 = V_MOV_B32_e32 %5045, implicit $exec
+    %6046:vgpr_32 = V_MOV_B32_e32 %5046, implicit $exec
+    %6047:vgpr_32 = V_MOV_B32_e32 %5047, implicit $exec
+    %6048:vgpr_32 = V_MOV_B32_e32 %5048, implicit $exec
+    %6049:vgpr_32 = V_MOV_B32_e32 %5049, implicit $exec
+    %6050:vgpr_32 = V_MOV_B32_e32 %5050, implicit $exec
+    %6051:vgpr_32 = V_MOV_B32_e32 %5051, implicit $exec
+    %6052:vgpr_32 = V_MOV_B32_e32 %5052, implicit $exec
+    %6053:vgpr_32 = V_MOV_B32_e32 %5053, implicit $exec
+    %6054:vgpr_32 = V_MOV_B32_e32 %5054, implicit $exec
+    %6055:vgpr_32 = V_MOV_B32_e32 %5055, implicit $exec
+    %6056:vgpr_32 = V_MOV_B32_e32 %5056, implicit $exec
+    %6057:vgpr_32 = V_MOV_B32_e32 %5057, implicit $exec
+    %6058:vgpr_32 = V_MOV_B32_e32 %5058, implicit $exec
+    %6059:vgpr_32 = V_MOV_B32_e32 %5059, implicit $exec
+    %6060:vgpr_32 = V_MOV_B32_e32 %5060, implicit $exec
+    %6061:vgpr_32 = V_MOV_B32_e32 %5061, implicit $exec
+    %6062:vgpr_32 = V_MOV_B32_e32 %5062, implicit $exec
+    %6063:vgpr_32 = V_MOV_B32_e32 %5063, implicit $exec
+    %6064:vgpr_32 = V_MOV_B32_e32 %5064, implicit $exec
+    %6065:vgpr_32 = V_MOV_B32_e32 %5065, implicit $exec
+    %6066:vgpr_32 = V_MOV_B32_e32 %5066, implicit $exec
+    %6067:vgpr_32 = V_MOV_B32_e32 %5067, implicit $exec
+    %6068:vgpr_32 = V_MOV_B32_e32 %5068, implicit $exec
+    %6069:vgpr_32 = V_MOV_B32_e32 %5069, implicit $exec
+    %6070:vgpr_32 = V_MOV_B32_e32 %5070, implicit $exec
+    %6071:vgpr_32 = V_MOV_B32_e32 %5071, implicit $exec
+    %6072:vgpr_32 = V_MOV_B32_e32 %5072, implicit $exec
+    %6073:vgpr_32 = V_MOV_B32_e32 %5073, implicit $exec
+    %6074:vgpr_32 = V_MOV_B32_e32 %5074, implicit $exec
+    %6075:vgpr_32 = V_MOV_B32_e32 %5075, implicit $exec
+    %6076:vgpr_32 = V_MOV_B32_e32 %5076, implicit $exec
+    %6077:vgpr_32 = V_MOV_B32_e32 %5077, implicit $exec
+    %6078:vgpr_32 = V_MOV_B32_e32 %5078, implicit $exec
+    %6079:vgpr_32 = V_MOV_B32_e32 %5079, implicit $exec
+    %6080:vgpr_32 = V_MOV_B32_e32 %5080, implicit $exec
+    %6081:vgpr_32 = V_MOV_B32_e32 %5081, implicit $exec
+    %6082:vgpr_32 = V_MOV_B32_e32 %5082, implicit $exec
+    %6083:vgpr_32 = V_MOV_B32_e32 %5083, implicit $exec
+    %6084:vgpr_32 = V_MOV_B32_e32 %5084, implicit $exec
+    %6085:vgpr_32 = V_MOV_B32_e32 %5085, implicit $exec
+    %6086:vgpr_32 = V_MOV_B32_e32 %5086, implicit $exec
+    %6087:vgpr_32 = V_MOV_B32_e32 %5087, implicit $exec
+    %6088:vgpr_32 = V_MOV_B32_e32 %5088, implicit $exec
+    %6089:vgpr_32 = V_MOV_B32_e32 %5089, implicit $exec
+    %6090:vgpr_32 = V_MOV_B32_e32 %5090, implicit $exec
+    %6091:vgpr_32 = V_MOV_B32_e32 %5091, implicit $exec
+    %6092:vgpr_32 = V_MOV_B32_e32 %5092, implicit $exec
+    %6093:vgpr_32 = V_MOV_B32_e32 %5093, implicit $exec
+    %6094:vgpr_32 = V_MOV_B32_e32 %5094, implicit $exec
+    %6095:vgpr_32 = V_MOV_B32_e32 %5095, implicit $exec
+    %6096:vgpr_32 = V_MOV_B32_e32 %5096, implicit $exec
+    %6097:vgpr_32 = V_MOV_B32_e32 %5097, implicit $exec
+    %6098:vgpr_32 = V_MOV_B32_e32 %5098, implicit $exec
+    %6099:vgpr_32 = V_MOV_B32_e32 %5099, implicit $exec
+    EXP 0, %6000, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6001, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6002, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6003, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6004, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6005, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6006, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6007, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6008, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6009, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6010, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6011, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6012, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6013, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6014, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6015, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6016, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6017, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6018, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6019, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6020, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6021, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6022, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6023, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6024, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6025, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6026, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6027, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6028, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6029, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6030, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6031, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6032, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6033, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6034, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6035, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6036, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6037, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6038, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6039, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6040, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6041, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6042, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6043, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6044, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6045, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6046, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6047, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6048, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6049, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6050, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6051, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6052, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6053, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6054, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6055, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6056, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6057, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6058, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6059, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6060, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6061, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6062, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6063, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6064, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6065, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6066, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6067, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6068, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6069, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6070, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6071, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6072, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6073, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6074, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6075, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6076, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6077, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6078, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6079, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6080, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6081, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6082, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6083, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6084, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6085, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6086, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6087, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6088, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6089, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6090, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6091, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6092, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6093, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6094, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6095, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6096, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6097, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6098, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, %6099, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+    S_ENDPGM 0
+...
+

>From d4fd382d1a23303d1804c3169a589f2aa55a58b4 Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Mon, 21 Apr 2025 15:59:36 -0700
Subject: [PATCH 05/11] clang format

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 69 +++++++++----------
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 14 ++--
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |  6 +-
 .../AMDGPUOccupancyAndLatencyHelper.cpp       |  5 +-
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |  4 +-
 5 files changed, 46 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 5c628a89766c3..3c5d592602c6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -12,20 +12,20 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPU.h"
 #include "AMDGPUMIRUtils.h"
 #include "AMDGPUOccupancyAndLatencyHelper.h"
-#include "AMDGPU.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/CodeGen/SlotIndexes.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "GCNRegPressure.h"
 
 #define DEBUG_TYPE "amdgpu-hot-block-remat"
 
@@ -111,19 +111,18 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  void applyCloneRemat(RematNode &Node,
-    std::vector<BlockLiveInfo> &HotBlocks,
-    MachineDominatorTree *DT, MachineRegisterInfo &MRI,
-    SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-    const SIInstrInfo *SIII, MachineFunction &MF);
+  void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+                       MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+                       SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+                       const SIInstrInfo *SIII, MachineFunction &MF);
   void applyRemat(MapVector<Register, RematNode> &RematMap,
-    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
-    llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
-    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
-    MachineFunction &MF);
+                  std::vector<BlockLiveInfo> &HotBlocks,
+                  MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
+                  MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                  const SIInstrInfo *SIII, MachineFunction &MF);
   bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
-    LiveIntervals *LIS, MachineDominatorTree *DT,
-    MachinePostDominatorTree *PDT, bool &IsNearTarget);
+                     LiveIntervals *LIS, MachineDominatorTree *DT,
+                     MachinePostDominatorTree *PDT, bool &IsNearTarget);
 
   StringRef getPassName() const override { return "AMDGPU rematerialize"; }
 
@@ -237,11 +236,11 @@ void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
   }
 }
 
-void AMDGPUHotBlockRematerialize::applyCloneRemat(RematNode &Node,
-                     std::vector<BlockLiveInfo> &HotBlocks,
-                     MachineDominatorTree *DT, MachineRegisterInfo &MRI,
-                     SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-                     const SIInstrInfo *SIII, MachineFunction &MF) {
+void AMDGPUHotBlockRematerialize::applyCloneRemat(
+    RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
+    MachineDominatorTree *DT, MachineRegisterInfo &MRI,
+    SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+    const SIInstrInfo *SIII, MachineFunction &MF) {
   unsigned Reg = Node.Reg;
 
   MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
@@ -359,11 +358,11 @@ void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
   SlotIndexes->insertMachineInstrInMaps(*DefMI);
 }
 
-void AMDGPUHotBlockRematerialize::applyRemat(MapVector<Register, RematNode> &RematMap,
-                std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
-                llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
-                const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
-                MachineFunction &MF) {
+void AMDGPUHotBlockRematerialize::applyRemat(
+    MapVector<Register, RematNode> &RematMap,
+    std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
+    llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
   std::vector<RematNode> UpdateList;
   for (auto &It : RematMap) {
     UpdateList.emplace_back(It.second);
@@ -381,8 +380,7 @@ void AMDGPUHotBlockRematerialize::applyRemat(MapVector<Register, RematNode> &Rem
     if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
       applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
     } else if (Node.Kind == RematNode::RematKind::Clone) {
-      applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII,
-                      MF);
+      applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
     }
   }
 }
@@ -1234,9 +1232,12 @@ void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
   dbgs() << "Total Size:" << TotalSize << "\n";
 }
 
-bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
-                   LiveIntervals *LIS, MachineDominatorTree *DT,
-                   MachinePostDominatorTree *PDT, bool &IsNearTarget) {
+bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
+                                                MachineLoopInfo *MLI,
+                                                LiveIntervals *LIS,
+                                                MachineDominatorTree *DT,
+                                                MachinePostDominatorTree *PDT,
+                                                bool &IsNearTarget) {
   const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
 
   const SIInstrInfo *SIII = ST->getInstrInfo();
@@ -1489,8 +1490,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, MachineLoop
 
   if (!SRematMap.empty()) {
     IsUpdated = true;
-    applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII,
-               MF);
+    applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF);
     LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
   }
 
@@ -1530,4 +1530,3 @@ char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
 FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
   return new AMDGPUHotBlockRematerialize();
 }
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index 6d6bd38c61c06..dfb90e5545c8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -13,13 +13,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUMIRUtils.h"
-#include "SIRegisterInfo.h"
 #include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
 
 #include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 
 #define DEBUG_TYPE "xb-mir-util"
 using namespace llvm;
@@ -101,11 +101,10 @@ bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
 
 } // namespace
 
-
 namespace llvm {
 
 bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
-                       llvm::MachineBasicBlock::iterator MI) {
+                 llvm::MachineBasicBlock::iterator MI) {
   const TargetRegisterInfo *TRI =
       MBB->getParent()->getRegInfo().getTargetRegisterInfo();
   for (auto It = MI; It != MBB->end(); ++It) {
@@ -205,9 +204,8 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
 // TouchedMBBSet is used for scheduling where local live interval could cross
 // multiple regions, need to calculate livereg for each region inside touched
 // MBB.
-bool isLocalLiveInterval(
-    const LiveInterval &LI, SlotIndexes *Indexes,
-    SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
+bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
+                         SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
   if (LI.hasSubRanges()) {
     for (const auto &S : LI.subranges()) {
       if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index 6b9079e5d65fb..2470e2bed482f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -86,8 +86,8 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
                        llvm::SlotIndexes *SlotIndexes);
 
 unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
-  const llvm::MachineRegisterInfo &MRI,
-  const llvm::SIRegisterInfo *SIRI);
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI);
 void collectLiveSetPressure(const LiveSet &LiveSet,
                             const llvm::MachineRegisterInfo &MRI,
                             const llvm::SIRegisterInfo *SIRI,
@@ -97,6 +97,6 @@ bool reach_block(llvm::MachineBasicBlock *FromBB,
                  llvm::MachineDominatorTree *DT,
                  llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
                  llvm::MachineBasicBlock *ToBB);
-}
+} // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index c2dbf1a8b297e..5c2b7904c46be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -16,8 +16,8 @@
 #include "GCNSubtarget.h"
 #include "SIInstrInfo.h"
 
-#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 
 #include <cmath>
@@ -144,7 +144,6 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
   }
 }
 
-
 SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
                           const llvm::MachineLoopInfo *MLI) {
   SchedScore TotalScore;
@@ -165,5 +164,3 @@ SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
 }
 
 } // namespace llvm
-
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index b513e7335ffe4..e30df0d457863 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -15,8 +15,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H
 
-#include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInstrItineraries.h"
 
 namespace llvm {
 
@@ -76,5 +76,5 @@ SchedScore collectLatency(llvm::MachineFunction &MF,
                           const llvm::GCNSubtarget &ST,
                           const llvm::MachineLoopInfo *MLI = nullptr);
 
-}
+} // namespace llvm
 #endif

>From 4f7d0dad93c64d94667e74dbd80fdabed3146144 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Tue, 22 Apr 2025 11:54:29 -0700
Subject: [PATCH 06/11] LLVM Style

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 87 +++++++------------
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 59 +++++--------
 .../AMDGPUOccupancyAndLatencyHelper.cpp       | 16 ++--
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |  5 --
 4 files changed, 60 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 3c5d592602c6f..e165b83b18850 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -176,21 +176,17 @@ DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
     bool IsDomAllHotBlocks = true;
     bool IsDomedByAllHotBlocks = true;
     for (MachineBasicBlock *HotMBB : HotBlockSet) {
-      if (!DT->dominates(MBB, HotMBB)) {
+      if (!DT->dominates(MBB, HotMBB))
         IsDomAllHotBlocks = false;
-      }
-      if (!DT->dominates(HotMBB, MBB)) {
+      if (!DT->dominates(HotMBB, MBB))
         IsDomedByAllHotBlocks = false;
-      }
-      if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) {
+      if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks)
         break;
-      }
     }
-    if (IsDomAllHotBlocks) {
+    if (IsDomAllHotBlocks)
       UserBlocks.erase(MBB);
-    } else if (IsDomedByAllHotBlocks) {
+    else if (IsDomedByAllHotBlocks)
       AfterHotRangeMBBs.insert(MBB);
-    }
   }
 
   // Split after hotRange block set by domtree.
@@ -274,18 +270,16 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
   for (auto UseIt : UserMap) {
     MachineBasicBlock *MBB = UseIt.first;
     // Skip same block uses.
-    if (MBB == DefMI->getParent()) {
+    if (MBB == DefMI->getParent())
       continue;
-    }
     // Skip MBB which share clone from other MBBs.
     if (UserMBBSet.count(MBB) == 0)
       continue;
 
     Register NewReg = MRI.createVirtualRegister(RC);
     auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg);
-    for (unsigned I = 1; I < OpNum; I++) {
+    for (unsigned I = 1; I < OpNum; I++)
       NewDef = NewDef.add(DefMI->getOperand(I));
-    }
 
     MachineInstr *InsertPointMI = UseIt.second.front();
     SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI);
@@ -364,9 +358,9 @@ void AMDGPUHotBlockRematerialize::applyRemat(
     llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
     const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
   std::vector<RematNode> UpdateList;
-  for (auto &It : RematMap) {
+  for (auto &It : RematMap)
     UpdateList.emplace_back(It.second);
-  }
+
   // Sort update list with slotIndex to make sure def moved before use.
   // If use moved before def, It might not be the first use anymore.
   std::sort(UpdateList.begin(), UpdateList.end(),
@@ -377,11 +371,10 @@ void AMDGPUHotBlockRematerialize::applyRemat(
             });
 
   for (RematNode &Node : UpdateList) {
-    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse)
       applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
-    } else if (Node.Kind == RematNode::RematKind::Clone) {
+    else if (Node.Kind == RematNode::RematKind::Clone)
       applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
-    }
   }
 }
 
@@ -410,12 +403,10 @@ unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
 
   GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
   unsigned SPressure = RP.getMaxSGPR();
-  if (SPressure > MaxSPressure) {
+  if (SPressure > MaxSPressure)
     MaxSPressure = SPressure;
-  }
-  if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) {
+  if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure)
     MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
-  }
   Status.MBBPressureMap[&MBB] = RP;
   return RP.getOccupancy(*ST);
 }
@@ -573,9 +564,8 @@ RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
   unsigned SInputPressure = 0;
   uint64_t Mask = 0xf;
   while (Mask != 0) {
-    if (Mask & SInputMask) {
+    if (Mask & SInputMask)
       SInputPressure += 4;
-    }
     Mask = Mask << 4;
   }
 
@@ -670,9 +660,8 @@ void updateLiveInfo(MapVector<Register, RematNode> &RematMap,
       // still before LiveInfo.BB, It is still live.
       unsigned LiveBBIndex = RPOTIndexMap[CurBB];
       unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
-      if (LiveBBIndex > InsertBBIndex) {
+      if (LiveBBIndex > InsertBBIndex)
         continue;
-      }
     }
     // Already in remat map, don't need to check again, remove from
     // candidate.
@@ -978,11 +967,10 @@ void buildRematCandiates(std::vector<RematNode> &Candidates,
 
     if (IsSafeCandidate) {
       int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR);
-      if (Gain > 0) {
+      if (Gain > 0)
         Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5));
-      } else {
+      else
         IsSafeCandidate = false;
-      }
     }
     // Save unsafe reg.
     if (!IsSafeCandidate)
@@ -1056,9 +1044,9 @@ int filterRematCandiates(std::vector<RematNode> &Candidates,
   // Work one def one use first.
   for (auto &Node : Candidates) {
     unsigned Reg = Node.Reg;
-    if (!MRI.hasOneNonDBGUse(Reg)) {
+    if (!MRI.hasOneNonDBGUse(Reg))
       continue;
-    }
+
     MachineInstr *DefMI = Node.DefMI;
     if (!isSafeToMove(DefMI, MRI)) {
       PinnedRegSet.insert(Reg);
@@ -1074,9 +1062,9 @@ int filterRematCandiates(std::vector<RematNode> &Candidates,
     // Try multi use case.
     for (auto &Node : Candidates) {
       unsigned Reg = Node.Reg;
-      if (MRI.hasOneNonDBGUse(Reg)) {
+      if (MRI.hasOneNonDBGUse(Reg))
         continue;
-      }
+
       MachineInstr *DefMI = Node.DefMI;
       if (!isSafeToMove(DefMI, MRI)) {
         PinnedRegSet.insert(Reg);
@@ -1161,10 +1149,9 @@ int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
       if (!Reg.isVirtual())
         continue;
 
-      if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) {
+      if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg()))
         // Not support mix of v and s when remat now.
         continue;
-      }
 
       const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
       int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
@@ -1245,9 +1232,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
 
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
   DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
-  for (MachineBasicBlock *MBB : RPOT) {
+  for (MachineBasicBlock *MBB : RPOT)
     RPOTIndexMap[MBB] = RPOTIndexMap.size();
-  }
 
   auto &MRI = MF.getRegInfo();
 
@@ -1267,9 +1253,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     RematSCnt += NearTargetRegLimit;
 
   bool IsSGPRSpill = false;
-  if (RematSCnt > 0) {
+  if (RematSCnt > 0)
     IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
-  }
 
   const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
 
@@ -1354,9 +1339,9 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     int RematSCnt = MaxSPressure - SReduced - SLimit;
 
     bool IsSGPRSpill = false;
-    if (RematSCnt > 0) {
+    if (RematSCnt > 0)
       IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF);
-    }
+
     bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
     // Try to add candidates into remat list.
 
@@ -1393,15 +1378,13 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
             getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI);
         if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
             RematSCnt) {
-          for (RematNode &Node : SRematList) {
+          for (RematNode &Node : SRematList)
             SRematMap[Node.Reg] = Node;
-          }
         } else {
           if (!IsForceRematSgpr)
             return false;
-          for (RematNode &Node : SRematList) {
+          for (RematNode &Node : SRematList)
             SRematMap[Node.Reg] = Node;
-          }
           // Find local one def one use candidates.
           for (MachineInstr &MI : *MBB) {
             if (MI.isDebugInstr())
@@ -1425,9 +1408,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
                                  /*IsVGPR*/ false);
             if (Gain > 0) {
               // Skip case when DefMI has implicit define which used by UseMI.
-              if (isImplicitDefUse(&MI, &UseMI)) {
+              if (isImplicitDefUse(&MI, &UseMI))
                 continue;
-              }
               RematNode Node = {Reg, &MI, (unsigned)Gain >> 5};
               Node.InsertPointMI = &UseMI;
               Node.Kind = RematNode::RematKind::OneDefOneUse;
@@ -1459,19 +1441,16 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     bool IsVRematOK =
         (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty();
     if (NeedSRemat && NeedVRemat) {
-      if (IsVRematOK && IsSRematOK) {
+      if (IsVRematOK && IsSRematOK)
         IsUpdated = true;
-      } else if (IsSGPRSpill) {
+      else if (IsSGPRSpill)
         IsUpdated = true;
-      }
     } else if (NeedSRemat) {
-      if (IsSRematOK) {
+      if (IsSRematOK)
         IsUpdated = true;
-      }
     } else if (NeedVRemat) {
-      if (IsVRematOK) {
+      if (IsVRematOK)
         IsUpdated = true;
-      }
     }
     // TODO: what to do when cannot reach target?
     if (NewRematSCnt > 0) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index dfb90e5545c8e..afa1a8853938f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -140,9 +140,8 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
     MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
   // If SCC is dead at MI when we can use MI as the insert point.
-  if (!llvm::isSccLiveAt(MBB, MI)) {
+  if (!llvm::isSccLiveAt(MBB, MI))
     return MI;
-  }
 
   const bool CheckForExecWrite =
       Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
@@ -150,11 +149,10 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
   // Get the starting reverse iterator taking care to handle the MBB->end()
   // case.
   MachineBasicBlock::reverse_iterator Start;
-  if (MI == MBB->end()) {
+  if (MI == MBB->end())
     Start = MBB->rbegin();
-  } else {
+  else
     Start = MI.getReverse();
-  }
 
   // Otherwise, walk backwards through the block looking for a location where
   // SCC is dead.
@@ -164,14 +162,12 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     // an insertion point (if that is a constraint from the caller).
     // The check for EXEC works for both wave64 and wave32 because
     // it will also catch Writes to the subregisters (e.g. exec_lo).
-    if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) {
+    if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
       break;
-    }
 
     if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
-        !It->readsRegister(AMDGPU::SCC, TRI)) {
+        !It->readsRegister(AMDGPU::SCC, TRI))
       return It->getIterator();
-    }
   }
 
   // If no safe location can be found in the block we can save and restore
@@ -207,20 +203,18 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
 bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
                          SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
   if (LI.hasSubRanges()) {
-    for (const auto &S : LI.subranges()) {
+    for (const auto &S : LI.subranges())
       if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
         return false;
-    }
   }
   return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
 }
 
 bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
   if (LI.hasSubRanges()) {
-    for (const auto &S : LI.subranges()) {
+    for (const auto &S : LI.subranges())
       if (!isLocalLiveRange(&S, Indexes))
         return false;
-    }
   }
   return isLocalLiveRange(&LI, Indexes);
 }
@@ -231,9 +225,8 @@ void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
   for (auto It : LiveSet) {
     int Reg = It.first;
     dbgs() << printReg(Reg, SIRI);
-    if (It.second.any()) {
+    if (It.second.any())
       dbgs() << " mask:" << It.second.getAsInteger();
-    }
     dbgs() << "\n";
   }
 }
@@ -405,15 +398,13 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
                    const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) {
   MachineOperand &DstMO = MI.getOperand(0);
   // Skip case when dst subReg not 0.
-  if (DstMO.getSubReg()) {
+  if (DstMO.getSubReg())
     return false;
-  }
   Register Reg = DstMO.getReg();
 
   SmallVector<MachineOperand *, 2> UseMOs;
-  for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) {
+  for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg))
     UseMOs.emplace_back(&UseMO);
-  }
 
   const llvm::TargetRegisterClass *NewRC =
       SIRI->getRegClass(Desc.operands().front().RegClass);
@@ -441,9 +432,8 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
         assert(OffsetOp != nullptr);
         int64_t Offset = OffsetOp->getImm();
         Offset += Offset * LaneSize;
-        if (!SIII->isLegalMUBUFImmOffset(Offset)) {
+        if (!SIII->isLegalMUBUFImmOffset(Offset))
           return false;
-        }
         OffsetOp->setImm(Offset);
       } else {
         return false;
@@ -473,14 +463,12 @@ bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
       }
     }
     // Update subReg for users.
-    for (MachineOperand *UseMO : UseMOs) {
+    for (MachineOperand *UseMO : UseMOs)
       updateSubReg(*UseMO, NewRC, Offset, SIRI);
-    }
   } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) {
     // Clear subReg when it's a single 32-bit reg.
-    for (MachineOperand *UseMO : UseMOs) {
+    for (MachineOperand *UseMO : UseMOs)
       UseMO->setSubReg(0);
-    }
   }
 
   MI.setDesc(Desc);
@@ -511,9 +499,8 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
       return false;
     LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI);
     LaneBitmask UseMask;
-    for (MachineOperand &MO : MRI.use_operands(Reg)) {
+    for (MachineOperand &MO : MRI.use_operands(Reg))
       UseMask |= llvm::getRegMask(MO, MRI);
-    }
 
     const unsigned FullMask = DstMask.getAsInteger();
     unsigned Mask = UseMask.getAsInteger();
@@ -602,11 +589,10 @@ void collectLiveSetPressure(const LiveSet &LiveSet,
   for (auto LiveIt : LiveSet) {
     unsigned Reg = LiveIt.first;
     unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI);
-    if (SIRI->isVGPR(MRI, Reg)) {
+    if (SIRI->isVGPR(MRI, Reg))
       VPressure += Size;
-    } else {
+    else
       SPressure += Size;
-    }
   }
 }
 
@@ -651,21 +637,18 @@ bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) {
 bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT,
                  MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
                  MachineBasicBlock *ToBB) {
-  if (FromBB == ToBB) {
+  if (FromBB == ToBB)
     return true;
-  }
 
-  if (DT->dominates(FromBB, ToBB)) {
+  if (DT->dominates(FromBB, ToBB))
     return true;
-  }
 
-  if (PDT->dominates(ToBB, FromBB)) {
+  if (PDT->dominates(ToBB, FromBB))
     return true;
-  }
 
-  if (loopContainsBoth(LI, ToBB, FromBB)) {
+  if (loopContainsBoth(LI, ToBB, FromBB))
     return true;
-  }
+
   // TODO: cover case hotBB in loop,
   //       one block in that loop dom BB or
   //       BB post dom one block in that loop.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index 5c2b7904c46be..6160fe5471376 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -101,11 +101,10 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
     auto GetAluStatus = [](const MachineInstr &MI,
                            const llvm::SIInstrInfo *SIII) {
       AluStatus Status = AluStatus::Nothing;
-      if (SIII->isVALU(MI.getOpcode())) {
+      if (SIII->isVALU(MI.getOpcode()))
         Status = AluStatus::Vector;
-      } else if (SIII->isSALU(MI.getOpcode())) {
+      else if (SIII->isSALU(MI.getOpcode()))
         Status = AluStatus::Scalar;
-      }
       return Status;
     };
     AluStatus Status = GetAluStatus(MI, SIII);
@@ -120,11 +119,10 @@ void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
     case AluStatus::Scalar: {
       Score.Alu += Latency;
       // Ignore mix alu.
-      if (PrevStatus != Status) {
+      if (PrevStatus != Status)
         PrevStatus = AluStatus::Nothing;
-      } else {
+      else
         Score.MixAlu += Latency;
-      }
     } break;
     }
   }
@@ -151,13 +149,11 @@ SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
     MachineBasicBlock &MBB = MFI;
     MachineBasicBlock::iterator Next;
     AMDGPULatencyTracker LatencyTracker(ST);
-    for (auto &MI : MBB) {
+    for (auto &MI : MBB)
       LatencyTracker.scan(MI);
-    }
     unsigned LoopDepth = 0;
-    if (MLI) {
+    if (MLI)
       LoopDepth = MLI->getLoopDepth(&MBB);
-    }
     TotalScore.sum(LatencyTracker.Score, LoopDepth);
   }
   return TotalScore;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index e30df0d457863..9c63fa7e6b4a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -39,12 +39,7 @@ struct SchedScore {
   unsigned Lds = 0; // Todo: count lds.
   SchedScore() {}
 
-  // Other info which can help compare schedule result.
-  float computeScore() const;
-  float computeScore2() const;
-
   void sum(const SchedScore &S, unsigned LoopDepth = 0);
-  bool isBetter(const SchedScore &S) const;
   bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
   // More latency can be hiden with ExtraOcc.
   unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;

>From 3cb2c8d067cf8c106702ab2807ba749c4e47e848 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Thu, 1 May 2025 12:45:37 -0700
Subject: [PATCH 07/11] Avoid duplicate shadow variable names

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index e165b83b18850..91371e0f5fe55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -1247,21 +1247,24 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   unsigned VLimit = Status.TargetVLimit;
   unsigned SLimit = Status.TargetSLimit;
 
-  int RematSCnt = Status.MaxSPressure - SLimit;
-  // when agressive sgpr remat, reserve some for allocation lost.
-  if (EnableAggressive)
-    RematSCnt += NearTargetRegLimit;
-
-  bool IsSGPRSpill = false;
-  if (RematSCnt > 0)
-    IsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
-
-  const bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance;
-
-  // If bound by lds, skip.
-  if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
-      !IsForceRematSgpr)
-    return false;
+  // Early check: bail out when occupancy is bound by LDS and SGPR remat is not forced.
+  {
+    int InitialRematSCnt = Status.MaxSPressure - SLimit;
+    // When aggressive SGPR remat is enabled, reserve some headroom for allocation loss.
+    if (EnableAggressive)
+      InitialRematSCnt += NearTargetRegLimit;
+
+    bool InitialIsSGPRSpill = false;
+    if (InitialRematSCnt > 0)
+      InitialIsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
+
+    const bool InitialIsForceRematSgpr = InitialIsSGPRSpill || Status.NotBalance;
+
+    // If bound by lds, skip.
+    if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
+        !InitialIsForceRematSgpr)
+      return false;
+  }
 
   MachineBasicBlock *EntryMBB = &MF.front();
 
@@ -1277,6 +1280,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     MachineBasicBlock *MBB = *It;
     auto &RP = Status.MBBPressureMap[MBB];
     // ignore block not hot.
+
     if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit &&
         (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) <
             Status.TargetSLimit)

>From 0775bb87d6739918a0b3d0cee85e7aefc0d1f220 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Fri, 2 May 2025 15:48:48 -0700
Subject: [PATCH 08/11] Big cleanup to clarify the flow of data, the purpose of
 functions, etc

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 241 ++++++++++--------
 1 file changed, 140 insertions(+), 101 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 91371e0f5fe55..9aa52ac1cf69e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -53,7 +53,7 @@ struct RematNode {
   RematNode(unsigned R, MachineInstr *MI, unsigned S)
       : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr),
         Kind(RematKind::Candidate), Size(S) {}
-  unsigned Reg;
+  Register Reg;
   MachineInstr *DefMI;
   MachineBasicBlock *InsertBlock;
   union {
@@ -61,7 +61,7 @@ struct RematNode {
     unsigned UserCount;
   };
   RematKind Kind;
-  unsigned Size;
+  unsigned Size; // Despite the name, this holds the estimated remat gain of the candidate.
 };
 
 struct BlockLiveInfo {
@@ -152,7 +152,7 @@ MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
 }
 
 DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
-    unsigned Reg, BlockMap<SmallVector<MachineInstr *, 2>> &UserBlocks,
+    Register Reg, BlockMap<SmallVector<MachineInstr *, 2>> &UserBlocks,
     DenseSet<MachineBasicBlock *> &UserMBBSet,
     std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT) {
   // Collect hot blocks which Exp is live in.
@@ -217,7 +217,7 @@ DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
   return DomMap;
 }
 
-void updateUsers(unsigned Reg, unsigned NewReg, bool IsSubRegDef,
+void updateUsers(Register Reg, unsigned NewReg, bool IsSubRegDef,
                  SmallVector<MachineInstr *, 2> &UserMIs) {
   for (MachineInstr *UseMI : UserMIs) {
     for (MachineOperand &MO : UseMI->operands()) {
@@ -237,20 +237,16 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
     MachineDominatorTree *DT, MachineRegisterInfo &MRI,
     SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
     const SIInstrInfo *SIII, MachineFunction &MF) {
-  unsigned Reg = Node.Reg;
-
+  Register Reg = Node.Reg;
   MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
-  auto DefOp = DefMI->getOperand(0);
+
   const MCInstrDesc &Desc = DefMI->getDesc();
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-  // When the unique def has subReg, just create newReg for the subReg part.
-  bool IsSubRegDef = false;
-  if (DefOp.getSubReg() != 0) {
-    RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg());
-    IsSubRegDef = true;
-  }
-  const DebugLoc DL = DefMI->getDebugLoc();
-  unsigned OpNum = DefMI->getNumOperands();
+  const TargetRegisterClass *RC =
+      SIRI->getAllocatableClass(SIII->getOpRegClass(*DefMI, 0));
+  const bool IsSubRegDef = DefMI->getOperand(0).getSubReg() != 0;
+
+  const DebugLoc &DL = DefMI->getDebugLoc();
+  const unsigned OpNum = DefMI->getNumOperands();
 
   Node.Kind = RematNode::RematKind::Clone;
 
@@ -550,7 +546,7 @@ RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
     const Register Reg = Livein.first;
     const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
     assert(Reg.isPhysical() && "input must be physical reg");
-    unsigned RegSize = RC->getLaneMask().getNumLanes();
+    const unsigned RegSize = RC->getLaneMask().getNumLanes();
     if (SIRI->isVGPR(MRI, Reg)) {
       VInputPressure += RegSize;
     } else {
@@ -621,8 +617,11 @@ bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
   return false;
 }
 
-// SGPR has alignment requirment, cannot get accurate reg number.
-const unsigned NearTargetRegLimit = 10;
+static unsigned AlignToSgprAllocationGranularity(const GCNSubtarget *ST,
+                                                 unsigned SgprCount) {
+  return llvm::alignTo(SgprCount, ST->getSGPRAllocGranule());
+}
+
 bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST,
                    MachineFunction &MF) {
   unsigned MaxSGPR = ST->getAddressableNumSGPRs();
@@ -638,13 +637,13 @@ bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST,
 }
 
 // Skip live reg remated to other block.
-void updateLiveInfo(MapVector<Register, RematNode> &RematMap,
-                    GCNRPTracker::LiveRegSet &LiveSet,
-                    const GCNRPTracker::LiveRegSet &InputLive,
-                    MachineBasicBlock *CurBB,
-                    DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+void updateLiveInfo(
+    const MapVector<Register, RematNode> &RematMap,
+    GCNRPTracker::LiveRegSet &LiveSet,
+    const GCNRPTracker::LiveRegSet &InputLive, const MachineBasicBlock *CurBB,
+    DenseMap<const MachineBasicBlock *, unsigned> &RPOTIndexMap) {
   for (auto &It : RematMap) {
-    unsigned Reg = It.first;
+    Register Reg = It.first;
     // Skip reg not in live set.
     if (!LiveSet.count(Reg))
       continue;
@@ -669,8 +668,17 @@ void updateLiveInfo(MapVector<Register, RematNode> &RematMap,
   }
 }
 
-int rematGain(MachineInstr *DefMI, unsigned Reg, const MachineRegisterInfo &MRI,
-              const SIRegisterInfo *SIRI, bool IsVGPR) {
+// Returns the actual register saving that would be achieved by moving or
+// cloning this instruction. It's essentially:
+//
+//     size(defs) - size(uses)
+//
+// Note that if it is not safe to move/clone this instruction, this function
+// returns 0.
+//
+int rematGainInBits(MachineInstr *DefMI, Register Reg,
+                    const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                    bool IsVGPR) {
   int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
   for (MachineOperand &MO : DefMI->operands()) {
     if (MO.isImm())
@@ -804,7 +812,7 @@ MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT,
 }
 
 MachineBasicBlock *
-findInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT,
+findInsertBlock(MachineInstr &DefMI, Register Reg, MachineDominatorTree *DT,
                 MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
                 const MachineRegisterInfo &MRI, bool MemBound) {
 
@@ -869,14 +877,14 @@ bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
   return true;
 }
 
-void addOneDefOneUseCandidate(RematNode &Node,
-                              std::vector<RematNode> &RematList,
-                              MachineRegisterInfo &MRI, int &RematCnt,
+void addOneDefOneUseCandidate(std::vector<RematNode> *OutRematList,
+                              int *OutRematCnt, const RematNode &Node,
+                              MachineRegisterInfo &MRI,
                               MachineDominatorTree *DT,
                               MachinePostDominatorTree *PDT,
                               MachineLoopInfo *MLI, bool IsVGPR,
                               bool MemBound) {
-  unsigned Reg = Node.Reg;
+  Register Reg = Node.Reg;
   MachineInstr *DefMI = Node.DefMI;
 
   unsigned Size = Node.Size;
@@ -918,24 +926,26 @@ void addOneDefOneUseCandidate(RematNode &Node,
     return;
   }
 
-  Node.InsertBlock = InsertBB;
-  Node.InsertPointMI = UseMI;
-  Node.Kind = RematNode::RematKind::OneDefOneUse;
-  RematList.emplace_back(Node);
-  RematCnt += Size;
+  RematNode FilteredNode = Node;
+  FilteredNode.InsertBlock = InsertBB;
+  FilteredNode.InsertPointMI = UseMI;
+  FilteredNode.Kind = RematNode::RematKind::OneDefOneUse;
+  OutRematList->emplace_back(FilteredNode);
+  *OutRematCnt += Size;
 }
 
-void buildRematCandiates(std::vector<RematNode> &Candidates,
+// Build remat candidates from the registers in `CandidateRegSet`.
+void buildRematCandiates(std::vector<RematNode> *OutCandidates,
+                         DenseSet<Register> *PinnedRegSet,
                          GCNRPTracker::LiveRegSet &CandidateRegSet,
-                         DenseSet<unsigned> &PinnedRegSet,
                          const MachineRegisterInfo &MRI,
                          const SIInstrInfo *SIII, const SIRegisterInfo *SIRI,
                          bool IsVGPR) {
 
-  for (auto LiveRegIt : CandidateRegSet) {
-    unsigned Reg = LiveRegIt.first;
+  for (const auto &LiveRegIt : CandidateRegSet) {
+    Register Reg = LiveRegIt.first;
     // Skip unsafe reg.
-    if (PinnedRegSet.count(Reg))
+    if (PinnedRegSet->count(Reg))
       continue;
 
     if (SIRI->isVGPR(MRI, Reg) != IsVGPR)
@@ -966,32 +976,32 @@ void buildRematCandiates(std::vector<RematNode> &Candidates,
     }
 
     if (IsSafeCandidate) {
-      int Gain = rematGain(MI, Reg, MRI, SIRI, IsVGPR);
+      int Gain = rematGainInBits(MI, Reg, MRI, SIRI, IsVGPR);
       if (Gain > 0)
-        Candidates.emplace_back(RematNode(Reg, MI, Gain >> 5));
+        OutCandidates->emplace_back(RematNode(Reg, MI, Gain >> 5));
       else
         IsSafeCandidate = false;
     }
     // Save unsafe reg.
     if (!IsSafeCandidate)
-      PinnedRegSet.insert(Reg);
+      PinnedRegSet->insert(Reg);
   }
 
   // Sort by gain.
-  std::sort(Candidates.begin(), Candidates.end(),
+  std::sort(OutCandidates->begin(), OutCandidates->end(),
             [](RematNode &I, RematNode &J) { return I.Size > J.Size; });
 }
 
-void addCloneCandidate(std::vector<RematNode *> &CloneList,
-                       std::vector<RematNode> &RematList,
-                       DenseSet<unsigned> &PinnedRegSet,
-                       MachineRegisterInfo &MRI, int &RematCnt) {
+void addCloneCandidate(std::vector<RematNode> *OutRematList, int *OutRematCnt,
+                       DenseSet<Register> *OutPinnedRegSet,
+                       std::vector<RematNode *> &&CloneList,
+                       const MachineRegisterInfo &MRI) {
   // Group user in same blocks.
   std::vector<BlockSet> UserSetList(CloneList.size());
 
   for (size_t I = 0; I < CloneList.size(); I++) {
     auto *Node = CloneList[I];
-    unsigned Reg = Node->Reg;
+    Register Reg = Node->Reg;
     MachineInstr *DefMI = Node->DefMI;
     // Group user in same blocks.
     BlockSet &UserSet = UserSetList[I];
@@ -1008,7 +1018,7 @@ void addCloneCandidate(std::vector<RematNode *> &CloneList,
         // Mark cannot remat for now.
         // TODO: try to split if is bigger than 4 and only used once per
         // channel.
-        PinnedRegSet.insert(Reg);
+        OutPinnedRegSet->insert(Reg);
         continue;
       }
     }
@@ -1029,31 +1039,38 @@ void addCloneCandidate(std::vector<RematNode *> &CloneList,
 
   for (RematNode *Node : CloneList) {
     Node->Kind = RematNode::RematKind::Clone;
-    RematList.emplace_back(*Node);
-    RematCnt += Node->Size;
+    OutRematList->emplace_back(*Node);
+    *OutRematCnt += Node->Size;
   }
 }
 
-int filterRematCandiates(std::vector<RematNode> &Candidates,
-                         std::vector<RematNode> &RematList,
-                         DenseSet<unsigned> &PinnedRegSet,
+// Filter `Candidates` into `OutRematList` based on whether they are safe to
+// move, and decide the actual kind of each candidate (move vs. clone).
+//
+// Updates `OutPinnedRegSet` with registers that cannot/should not be moved.
+//
+// Returns the accumulated size of all filtered candidates.
+//
+int filterRematCandiates(std::vector<RematNode> *OutRematList,
+                         DenseSet<Register> *OutPinnedRegSet,
+                         std::vector<RematNode> &&Candidates,
                          MachineDominatorTree *DT,
                          MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
                          MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) {
   int RematCnt = 0;
   // Work one def one use first.
   for (auto &Node : Candidates) {
-    unsigned Reg = Node.Reg;
+    Register Reg = Node.Reg;
     if (!MRI.hasOneNonDBGUse(Reg))
       continue;
 
     MachineInstr *DefMI = Node.DefMI;
     if (!isSafeToMove(DefMI, MRI)) {
-      PinnedRegSet.insert(Reg);
+      OutPinnedRegSet->insert(Reg);
       continue;
     }
 
-    addOneDefOneUseCandidate(Node, RematList, MRI, RematCnt, DT, PDT, MLI,
+    addOneDefOneUseCandidate(OutRematList, &RematCnt, Node, MRI, DT, PDT, MLI,
                              IsVGPR, MemBound);
   }
 
@@ -1061,13 +1078,13 @@ int filterRematCandiates(std::vector<RematNode> &Candidates,
     std::vector<RematNode *> CloneList;
     // Try multi use case.
     for (auto &Node : Candidates) {
-      unsigned Reg = Node.Reg;
+      Register Reg = Node.Reg;
       if (MRI.hasOneNonDBGUse(Reg))
         continue;
 
       MachineInstr *DefMI = Node.DefMI;
       if (!isSafeToMove(DefMI, MRI)) {
-        PinnedRegSet.insert(Reg);
+        OutPinnedRegSet->insert(Reg);
         continue;
       }
 
@@ -1075,18 +1092,25 @@ int filterRematCandiates(std::vector<RematNode> &Candidates,
       CloneList.emplace_back(&Node);
     }
 
-    addCloneCandidate(CloneList, RematList, PinnedRegSet, MRI, RematCnt);
+    addCloneCandidate(OutRematList, &RematCnt, OutPinnedRegSet,
+                      std::move(CloneList), MRI);
   }
 
   return RematCnt;
 }
 
-int getReducedSize(MapVector<Register, RematNode> &RematMap,
-                   GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts,
-                   const MachineRegisterInfo &MRI, BlockLiveInfo &LiveInfo,
-                   DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+// Calculate the reduced register pressure of RematMap w.r.t. the BB associated
+// with LiveInfo.
+// Returns the number of registers reduced, and collects the instructions
+// associated with the reduction nodes into `OutReducedInsts`.
+int getReducedSize(const MapVector<Register, RematNode> &RematMap,
+                   GCNRPTracker::LiveRegSet &CanidateSet,
+                   const MachineRegisterInfo &MRI,
+                   const BlockLiveInfo &LiveInfo,
+                   DenseMap<const MachineBasicBlock *, unsigned> &RPOTIndexMap,
+                   InstSet *OutReducedInsts) {
   int ReducedSize = 0;
-  for (auto &It : RematMap) {
+  for (const auto &It : RematMap) {
     Register Reg = It.first;
 
     if (!CanidateSet.count(Reg))
@@ -1115,7 +1139,7 @@ int getReducedSize(MapVector<Register, RematNode> &RematMap,
     }
     if (IsReduced) {
       ReducedSize += Node.Size;
-      ReducedInsts.insert(Node.DefMI);
+      OutReducedInsts->insert(Node.DefMI);
     }
 
     // Already in remat map, don't need to check again, remove from candidate.
@@ -1125,11 +1149,15 @@ int getReducedSize(MapVector<Register, RematNode> &RematMap,
   return ReducedSize;
 }
 
-int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
+// Calculate the amount of OVERLAPPING register pressure among all
+// the instructions in `ReducedInsts`. E.g. for:
+//    x = COPY a:sgpr_32
+//    y = COPY a:sgpr_32
+// This function would return 1.
+int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR,
                          const MachineRegisterInfo &MRI,
                          const SIRegisterInfo *SIRI) {
 
-  // Find shared operand in ReducedInsts.
   int SharedSize = 0;
   DenseMap<unsigned, LaneBitmask> SharedRegMaskMap;
   for (MachineInstr *DefMI : ReducedInsts) {
@@ -1156,6 +1184,7 @@ int getSharedReducedSize(InstSet &ReducedInsts, bool IsVGPR,
       const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
       int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
       unsigned Mask;
+      // FIXME: Lane mask is now in the granularity of 16-bit lanes.
       if (unsigned SubIdx = MO.getSubReg()) {
         OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
         int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
@@ -1219,6 +1248,9 @@ void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
   dbgs() << "Total Size:" << TotalSize << "\n";
 }
 
+// A heuristic margin to keep the target SGPR count away from the limit.
+constexpr unsigned SgprLimitBias = 10;
+
 bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
                                                 MachineLoopInfo *MLI,
                                                 LiveIntervals *LIS,
@@ -1231,8 +1263,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   const SIRegisterInfo *SIRI = ST->getRegisterInfo();
 
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
-  DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
-  for (MachineBasicBlock *MBB : RPOT)
+  DenseMap<const MachineBasicBlock *, unsigned> RPOTIndexMap;
+  for (const MachineBasicBlock *MBB : RPOT)
     RPOTIndexMap[MBB] = RPOTIndexMap.size();
 
   auto &MRI = MF.getRegInfo();
@@ -1244,25 +1276,23 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   if (Status.TargetOcc >= MaxOcc)
     return false;
 
-  unsigned VLimit = Status.TargetVLimit;
-  unsigned SLimit = Status.TargetSLimit;
-
   // Early check for
   {
-    int InitialRematSCnt = Status.MaxSPressure - SLimit;
+    int InitialRematSCnt = Status.MaxSPressure - Status.TargetSLimit;
     // When aggressive SGPR remat, reserve some headroom for allocation loss.
     if (EnableAggressive)
-      InitialRematSCnt += NearTargetRegLimit;
+      InitialRematSCnt += SgprLimitBias;
 
     bool InitialIsSGPRSpill = false;
     if (InitialRematSCnt > 0)
       InitialIsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF);
 
-    const bool InitialIsForceRematSgpr = InitialIsSGPRSpill || Status.NotBalance;
+    const bool InitialIsForceRematSgpr =
+        InitialIsSGPRSpill || Status.NotBalance;
 
     // If bound by lds, skip.
     if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
-      !InitialIsForceRematSgpr)
+        !InitialIsForceRematSgpr)
       return false;
   }
 
@@ -1274,7 +1304,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   MapVector<Register, RematNode> VRematMap;
   MapVector<Register, RematNode> SRematMap;
   // Reg which cannot move around to remat.
-  DenseSet<unsigned> PinnedRegSet;
+  DenseSet<Register> PinnedRegSet;
   std::vector<BlockLiveInfo> HotBlocks;
   for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) {
     MachineBasicBlock *MBB = *It;
@@ -1317,7 +1347,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
         MaxSPressure = SPressure;
     }
     MaxSPressure += RegForVCC + Status.InputPhysicalSPressure;
-    if (MaxVPressure <= VLimit && MaxSPressure <= SLimit)
+    if (MaxVPressure <= Status.TargetVLimit &&
+        MaxSPressure <= Status.TargetSLimit)
       continue;
 
     // Build block live info.
@@ -1333,14 +1364,14 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     // Update reg pressure based on remat list.
     InstSet VReducedInsts;
     InstSet SReducedInsts;
-    int VReduced = getReducedSize(VRematMap, CandidateRegs, VReducedInsts, MRI,
-                                  LiveInfo, RPOTIndexMap);
-    int SReduced = getReducedSize(SRematMap, CandidateRegs, SReducedInsts, MRI,
-                                  LiveInfo, RPOTIndexMap);
+    int VReduced = getReducedSize(VRematMap, CandidateRegs, MRI, LiveInfo,
+                                  RPOTIndexMap, &VReducedInsts);
+    int SReduced = getReducedSize(SRematMap, CandidateRegs, MRI, LiveInfo,
+                                  RPOTIndexMap, &SReducedInsts);
 
-    // Calculate size need to be remat.
-    int RematVCnt = MaxVPressure - VReduced - VLimit;
-    int RematSCnt = MaxSPressure - SReduced - SLimit;
+    // Calculate size need to be remat for this BB.
+    const int RematVCnt = MaxVPressure - VReduced - Status.TargetVLimit;
+    const int RematSCnt = MaxSPressure - SReduced - Status.TargetSLimit;
 
     bool IsSGPRSpill = false;
     if (RematSCnt > 0)
@@ -1353,34 +1384,41 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     if (RematSCnt > 0) {
       // Build candidate nodes.
       std::vector<RematNode> SRematCandidates;
-      buildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI,
+      buildRematCandiates(&SRematCandidates, &PinnedRegSet, CandidateRegs, MRI,
                           SIII, SIRI, /*IsVGPR*/ false);
 
       LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI));
       std::vector<RematNode> SRematList;
       // Filter candidates.
-      NewRematSCnt = filterRematCandiates(SRematCandidates, SRematList,
-                                          PinnedRegSet, DT, PDT, MLI, MRI,
-                                          /*IsVGPR*/ false, Status.MemBound);
+      NewRematSCnt =
+          filterRematCandiates(&SRematList, &PinnedRegSet,
+                               std::move(SRematCandidates), DT, PDT, MLI, MRI,
+                               /*IsVGPR*/ false, Status.MemBound);
       if (NewRematSCnt > RematSCnt) {
         // Has enough remat node to cover rematCnt.
         int RematCnt = 0;
         for (RematNode &Node : SRematList) {
           SRematMap[Node.Reg] = Node;
           RematCnt += Node.Size;
+          // Stop if the size has reached the required amount, unless
+          // aggressive is set.
           if (RematCnt > RematSCnt && !EnableAggressive)
             break;
         }
         NewRematSCnt = 0;
       } else {
-
         for (RematNode &Node : SRematList) {
           SReducedInsts.insert(Node.DefMI);
         }
-        // Check shared size.
+        // Check shared size. These are reg uses that are shared among all the
+        // instructions. The overlap will not actually contribute to the
+        // pressure increase when an instruction is moved/cloned, so it can be
+        // treated as a gain.
         int SharedReducedSize =
             getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI);
-        if (((NewRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
+
+        int LocalGains = 0;
+        if (((NewRematSCnt + SharedReducedSize) + (int)SgprLimitBias) >=
             RematSCnt) {
           for (RematNode &Node : SRematList)
             SRematMap[Node.Reg] = Node;
@@ -1408,8 +1446,8 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
             MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
             if (UseMI.getParent() != MBB)
               continue;
-            int Gain = rematGain(&MI, Reg, MRI, SIRI,
-                                 /*IsVGPR*/ false);
+            int Gain = rematGainInBits(&MI, Reg, MRI, SIRI,
+                                       /*IsVGPR*/ false);
             if (Gain > 0) {
               // Skip case when DefMI has implicit define which used by UseMI.
               if (isImplicitDefUse(&MI, &UseMI))
@@ -1418,11 +1456,12 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
               Node.InsertPointMI = &UseMI;
               Node.Kind = RematNode::RematKind::OneDefOneUse;
               SRematMap[Reg] = Node;
-              SharedReducedSize += Node.Size;
+              LocalGains += Node.Size;
             }
           }
         }
-        NewRematSCnt = RematSCnt - NewRematSCnt - SharedReducedSize;
+        NewRematSCnt =
+            RematSCnt - NewRematSCnt - SharedReducedSize - LocalGains;
       }
     }
     // If works, continue.
@@ -1458,7 +1497,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
     }
     // TODO: what to do when cannot reach target?
     if (NewRematSCnt > 0) {
-      if ((unsigned)NewRematSCnt <= NearTargetRegLimit) {
+      if ((unsigned)NewRematSCnt <= ST->getSGPRAllocGranule()) {
         IsNearTarget = true;
       } else {
         if (!IsSGPRSpill)

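To make the gain bookkeeping in this patch concrete, here is a minimal, self-contained sketch of the arithmetic behind rematGainInBits and the Size field recorded by buildRematCandiates. The operand sizes below are hypothetical (not taken from the patch); the real function additionally returns 0 for instructions that are unsafe to move or clone and skips uses that stay live across the def.

  // Illustrative only: a 256-bit SGPR tuple defined from one 64-bit operand
  // that is not live past the def.
  #include <cstdio>

  int main() {
    const int DefBits = 256;
    const int UseBits = 64;
    const int GainBits = DefBits - UseBits; // size(defs) - size(uses) == 192
    // Candidates store Gain >> 5, i.e. the saving measured in 32-bit SGPRs.
    std::printf("gain: %d bits = %d x 32-bit regs\n", GainBits, GainBits >> 5);
    return 0;
  }

getSharedReducedSize then credits back operands that several accepted candidates read in common, since moving or cloning them does not multiply that part of the pressure.
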
>From f861409fb5484412179696ae0613be62334af17f Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Mon, 5 May 2025 09:34:31 -0700
Subject: [PATCH 09/11] Deleted the isLocal* functions from Utils

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    |  4 +-
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 67 -------------------
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |  8 ---
 3 files changed, 2 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 9aa52ac1cf69e..2d6cc5f010bd5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -443,7 +443,7 @@ unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS,
       const auto &LI = LIS->getInterval(Reg);
 
       // Skip local live interval to make live input/ouput faster.
-      if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+      if (LIS->intervalIsInOneMBB(LI))
         continue;
 
       for (auto InputIt : MBBInputSlotMap) {
@@ -1276,7 +1276,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   if (Status.TargetOcc >= MaxOcc)
     return false;
 
-  // Early check for
+  // Early exit: skip when occupancy is LDS-bound and SGPR remat is not forced.
   {
     int InitialRematSCnt = Status.MaxSPressure - Status.TargetSLimit;
     // When aggressive SGPR remat, reserve some headroom for allocation loss.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index afa1a8853938f..81395e1ab887c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -36,49 +36,6 @@ bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd,
 } // namespace llvm
 
 namespace {
-bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
-                    SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
-  MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
-  MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
-  // Treat non inst as not local.
-  if (!StartMI || !EndMI)
-    return false;
-  // is local when parent MBB the same.
-  bool IsSameMBB = StartMI->getParent() == EndMI->getParent();
-  if (!IsSameMBB)
-    return false;
-  // Collect touched MBB.
-  MachineBasicBlock *MBB = StartMI->getParent();
-  TouchedMBBSet.insert(MBB);
-  return true;
-}
-
-bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes,
-                      SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
-  for (const LiveRange::Segment &Seg : Range->segments) {
-    if (!isLocalSegment(&Seg, Indexes, TouchedMBBSet))
-      return false;
-  }
-  return true;
-}
-
-bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) {
-  MachineInstr *StartMI = Indexes->getInstructionFromIndex(Seg->start);
-  MachineInstr *EndMI = Indexes->getInstructionFromIndex(Seg->end);
-  // Treat non inst as not local.
-  if (!StartMI || !EndMI)
-    return false;
-  // is local when parent MBB the same.
-  return StartMI->getParent() == EndMI->getParent();
-}
-
-bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
-  for (const LiveRange::Segment &Seg : Range->segments) {
-    if (!isLocalSegment(&Seg, Indexes))
-      return false;
-  }
-  return true;
-}
 
 // LoopInfo contains a mapping from basic block to the innermost loop. Find
 // the outermost loop in the loop nest that contains BB.
@@ -195,30 +152,6 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
   return MI;
 }
 
-// In case like float4 v, v.x used and defined in one block, v.y used and define
-// in another block, one live interval could touch more than one MBB.
-// TouchedMBBSet is used for scheduling where local live interval could cross
-// multiple regions, need to calculate livereg for each region inside touched
-// MBB.
-bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes,
-                         SmallDenseSet<MachineBasicBlock *, 2> &TouchedMBBSet) {
-  if (LI.hasSubRanges()) {
-    for (const auto &S : LI.subranges())
-      if (!isLocalLiveRange(&S, Indexes, TouchedMBBSet))
-        return false;
-  }
-  return isLocalLiveRange(&LI, Indexes, TouchedMBBSet);
-}
-
-bool isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
-  if (LI.hasSubRanges()) {
-    for (const auto &S : LI.subranges())
-      if (!isLocalLiveRange(&S, Indexes))
-        return false;
-  }
-  return isLocalLiveRange(&LI, Indexes);
-}
-
 void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
 
   dbgs() << "\n live set: \n";
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index 2470e2bed482f..d9fa63ba2b5ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -33,14 +33,6 @@ constexpr unsigned RegForVCC = 2;
 bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
                        llvm::MachineBasicBlock &MBB);
 
-// Check if LI live cross basic blocks, save all touched basic block if is
-// local.
-bool isLocalLiveInterval(
-    const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
-    llvm::SmallDenseSet<llvm::MachineBasicBlock *, 2> &TouchedMBBSet);
-bool isLocalLiveInterval(const llvm::LiveInterval &LI,
-                         llvm::SlotIndexes *Indexes);
-
 bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
 
 using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;

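A note on the substitution above: LiveIntervals::intervalIsInOneMBB returns the containing MachineBasicBlock when the whole interval lies inside a single block and nullptr otherwise, so using it directly as the condition in collectFnPressure is a close match for the deleted isLocalLiveInterval check. The only thing dropped is the TouchedMBBSet bookkeeping, which the remat pass did not use.
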
>From eaaf6ddaf9858c4d1ea34beaac6dcf1694199a60 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Tue, 6 May 2025 19:41:41 -0700
Subject: [PATCH 10/11] Fixed SCC, and updated and simplified tests

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    |  69 +-
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 412 ++----------
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |   6 +-
 llvm/test/CodeGen/AMDGPU/remat/phi.mir        | 607 ------------------
 .../CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir | 565 ----------------
 .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 569 +++++-----------
 .../AMDGPU/remat/simple_sgpr_long_scc.mir     | 575 +++++++++++++++++
 .../AMDGPU/remat/simple_sgpr_no_scc.mir       | 564 ++++++++++++++++
 .../CodeGen/AMDGPU/remat/simple_sgpr_phi.mir  | 304 +++++++++
 .../CodeGen/AMDGPU/remat/simple_sgpr_scc.mir  | 564 ++++++++++++++++
 10 files changed, 2233 insertions(+), 2002 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/remat/phi.mir
 delete mode 100644 llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 2d6cc5f010bd5..3a0fa5cad4c13 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -32,7 +32,7 @@
 using namespace llvm;
 
 static cl::opt<bool>
-    EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+    EnableAggressiveSgpr("amdgpu-remat-enable-hot-block-remat-aggressive-sgpr");
 static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
 
 namespace {
@@ -114,12 +114,14 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
   void applyCloneRemat(RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
                        MachineDominatorTree *DT, MachineRegisterInfo &MRI,
                        SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-                       const SIInstrInfo *SIII, MachineFunction &MF);
+                       const SIInstrInfo *SIII, LiveIntervals *LIS,
+                       MachineFunction &MF);
   void applyRemat(MapVector<Register, RematNode> &RematMap,
                   std::vector<BlockLiveInfo> &HotBlocks,
                   MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes,
                   MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
-                  const SIInstrInfo *SIII, MachineFunction &MF);
+                  const SIInstrInfo *SIII, LiveIntervals *LIS,
+                  MachineFunction &MF);
   bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI,
                      LiveIntervals *LIS, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT, bool &IsNearTarget);
@@ -140,12 +142,12 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
 MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash(
     MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
     MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
-    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, LiveIntervals *LIS) {
   const bool WillSmashScc =
       InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
   if (WillSmashScc) {
     CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef(
-        MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+        MBB, CurrentInsertPoint, SIRI, SIII, &MRI, LIS);
   }
 
   return CurrentInsertPoint;
@@ -236,7 +238,7 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
     RematNode &Node, std::vector<BlockLiveInfo> &HotBlocks,
     MachineDominatorTree *DT, MachineRegisterInfo &MRI,
     SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
-    const SIInstrInfo *SIII, MachineFunction &MF) {
+    const SIInstrInfo *SIII, LiveIntervals *LIS, MachineFunction &MF) {
   Register Reg = Node.Reg;
   MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
 
@@ -289,7 +291,7 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
     }
 
     MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash(
-        DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII);
+        DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII, LIS);
 
     for (MachineMemOperand *MO : DefMI->memoperands()) {
       NewDef->addMemOperand(MF, MO);
@@ -310,8 +312,6 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
         updateUsers(Reg, NewReg, IsSubRegDef, UserMIs);
       }
     }
-
-    llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes);
   }
   if (MRI.use_empty(Reg)) {
     SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
@@ -320,8 +320,8 @@ void AMDGPUHotBlockRematerialize::applyCloneRemat(
 
 void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
                             SlotIndexes *SlotIndexes,
-                            const SIRegisterInfo *SIRI,
-                            const SIInstrInfo *SIII) {
+                            const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+                            LiveIntervals *LIS) {
   MachineInstr *DefMI = Node.DefMI;
   MachineInstr *InsertPointMI = Node.InsertPointMI;
   MachineBasicBlock *MBB = nullptr;
@@ -337,7 +337,7 @@ void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
   }
 
   InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI,
-                                                 SIRI, SIII);
+                                                 SIRI, SIII, LIS);
 
   // Move instruction to new location.
   DefMI->removeFromParent();
@@ -352,7 +352,8 @@ void AMDGPUHotBlockRematerialize::applyRemat(
     MapVector<Register, RematNode> &RematMap,
     std::vector<BlockLiveInfo> &HotBlocks, MachineDominatorTree *DT,
     llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI,
-    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineFunction &MF) {
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, LiveIntervals *LIS,
+    MachineFunction &MF) {
   std::vector<RematNode> UpdateList;
   for (auto &It : RematMap)
     UpdateList.emplace_back(It.second);
@@ -368,9 +369,10 @@ void AMDGPUHotBlockRematerialize::applyRemat(
 
   for (RematNode &Node : UpdateList) {
     if (Node.Kind == RematNode::RematKind::OneDefOneUse)
-      applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII);
+      applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII, LIS);
     else if (Node.Kind == RematNode::RematKind::Clone)
-      applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, MF);
+      applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, LIS,
+                      MF);
   }
 }
 
@@ -617,11 +619,6 @@ bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
   return false;
 }
 
-static unsigned AlignToSgprAllocationGranularity(const GCNSubtarget *ST,
-                                                 unsigned SgprCount) {
-  return llvm::alignTo(SgprCount, ST->getSGPRAllocGranule());
-}
-
 bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST,
                    MachineFunction &MF) {
   unsigned MaxSGPR = ST->getAddressableNumSGPRs();
@@ -720,7 +717,7 @@ int rematGainInBits(MachineInstr *DefMI, Register Reg,
     if (IsSingleDef) {
       // The reg might share with other candidates,  check It here.
       // Count share reg in getReducedSize.
-      if (EnableAggressive) {
+      if (EnableAggressiveSgpr) {
         // In case of aggressive remat, treat multi use reg as shared reg and
         // ignore size of shared reg.
         if (!MRI.hasOneNonDBGUse(Reg))
@@ -858,7 +855,7 @@ bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
   return false;
 }
 
-bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+bool isSafeToMoveOrClone(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
   // Do not move PHI nodes
   if (isUsedByPhi(DefMI, MRI))
     return false;
@@ -869,7 +866,7 @@ bool isSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
     MachineOperand &Op = DefMI->getOperand(I);
     if (!Op.isReg())
       continue;
-    if (!MRI.getUniqueVRegDef(Op.getReg()) &&
+    if (!Op.getReg().isPhysical() && !MRI.getUniqueVRegDef(Op.getReg()) &&
         !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) {
       return false;
     }
@@ -1065,7 +1062,7 @@ int filterRematCandiates(std::vector<RematNode> *OutRematList,
       continue;
 
     MachineInstr *DefMI = Node.DefMI;
-    if (!isSafeToMove(DefMI, MRI)) {
+    if (!isSafeToMoveOrClone(DefMI, MRI)) {
       OutPinnedRegSet->insert(Reg);
       continue;
     }
@@ -1083,7 +1080,7 @@ int filterRematCandiates(std::vector<RematNode> *OutRematList,
         continue;
 
       MachineInstr *DefMI = Node.DefMI;
-      if (!isSafeToMove(DefMI, MRI)) {
+      if (!isSafeToMoveOrClone(DefMI, MRI)) {
         OutPinnedRegSet->insert(Reg);
         continue;
       }
@@ -1149,6 +1146,12 @@ int getReducedSize(const MapVector<Register, RematNode> &RematMap,
   return ReducedSize;
 }
 
+static unsigned getNumLanesIn32BitReg(bool IsVgpr) {
+  const TargetRegisterClass *RC =
+      IsVgpr ? &AMDGPU::VGPR_32RegClass : &AMDGPU::SGPR_32RegClass;
+  return RC->LaneMask.getNumLanes();
+}
+
 // Calculate the amount of OVERLAPPING register pressure among all
 // the instructions in `ReducedInsts`. E.g. for:
 //    x = COPY a:sgpr_32
@@ -1157,7 +1160,6 @@ int getReducedSize(const MapVector<Register, RematNode> &RematMap,
 int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR,
                          const MachineRegisterInfo &MRI,
                          const SIRegisterInfo *SIRI) {
-
   int SharedSize = 0;
   DenseMap<unsigned, LaneBitmask> SharedRegMaskMap;
   for (MachineInstr *DefMI : ReducedInsts) {
@@ -1182,8 +1184,9 @@ int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR,
         continue;
 
       const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
-      int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
-      unsigned Mask;
+      const int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+
+      unsigned Mask = 0;
       // FIXME: Lane mask is now in the granularity of 16-bit lanes.
       if (unsigned SubIdx = MO.getSubReg()) {
         OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
@@ -1210,7 +1213,9 @@ int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR,
       }
     }
   }
-  return SharedSize;
+
+  const unsigned NumLanesPerReg = getNumLanesIn32BitReg(IsVGPR);
+  return SharedSize / NumLanesPerReg;
 }
 
 void dumpRematMap(MapVector<Register, RematNode> &RematMap,
@@ -1280,7 +1285,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
   {
     int InitialRematSCnt = Status.MaxSPressure - Status.TargetSLimit;
     // When aggressive SGPR remat, reserve some headroom for allocation loss.
-    if (EnableAggressive)
+    if (EnableAggressiveSgpr)
       InitialRematSCnt += SgprLimitBias;
 
     bool InitialIsSGPRSpill = false;
@@ -1402,7 +1407,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
           RematCnt += Node.Size;
           // Stop if the size has reached the required amount, unless
           // aggressive is set.
-          if (RematCnt > RematSCnt && !EnableAggressive)
+          if (RematCnt > RematSCnt && !EnableAggressiveSgpr)
             break;
         }
         NewRematSCnt = 0;
@@ -1512,7 +1517,7 @@ bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF,
 
   if (!SRematMap.empty()) {
     IsUpdated = true;
-    applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, MF);
+    applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, LIS, MF);
     LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
   }
 
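A worked example of the new normalization in getSharedReducedSize, assuming (per the FIXME above) that lane masks are tracked at 16-bit granularity: a 32-bit SGPR then covers two lanes, so getNumLanesIn32BitReg(false) returns 2. If two clone candidates both read the same 64-bit (sub0_sub1) operand, the shared mask accumulates four lanes for that register, and dividing by the per-register lane count reports the overlap as two 32-bit SGPRs. On this reading, the divide simply converts the lane-based tally back into the register units the rest of the pass works in.
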
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index 81395e1ab887c..4c55d172018d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -17,6 +17,7 @@
 #include "SIRegisterInfo.h"
 
 #include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -60,25 +61,14 @@ bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
 
 namespace llvm {
 
-bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
-                 llvm::MachineBasicBlock::iterator MI) {
-  const TargetRegisterInfo *TRI =
-      MBB->getParent()->getRegInfo().getTargetRegisterInfo();
-  for (auto It = MI; It != MBB->end(); ++It) {
-    const MachineInstr &CurMI = *It;
-    // Hit use of scc, it is live.
-    if (CurMI.readsRegister(AMDGPU::SCC, TRI))
-      return true;
-    // Hit def of scc first, not live.
-    if (CurMI.definesRegister(AMDGPU::SCC, TRI))
-      return false;
-  }
-  // Reach the end of MBB, check live-ins of MBB successors.
-  for (const MachineBasicBlock *Succ : MBB->successors()) {
-    if (Succ->isLiveIn(AMDGPU::SCC))
-      return true;
-  }
-  return false;
+bool isSccLiveAt(const MachineInstr &MI, LiveIntervals *LIS) {
+  if (!LIS)
+    return true;
+  const TargetRegisterInfo *TRI = MI.getMF()->getSubtarget().getRegisterInfo();
+  LiveRange &LR =
+      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+  return LR.liveAt(Idx);
 }
 
 //
@@ -95,21 +85,16 @@ bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
 MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     MachineBasicBlock *MBB, MachineBasicBlock::iterator MI,
     const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
-    MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
+    MachineRegisterInfo *MRI, LiveIntervals *LIS,
+    SccDefInsertPointConstraintFlags Constraints) {
   // If SCC is dead at MI when we can use MI as the insert point.
-  if (!llvm::isSccLiveAt(MBB, MI))
+  if (!llvm::isSccLiveAt(*MI, LIS))
     return MI;
 
   const bool CheckForExecWrite =
       Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
 
-  // Get the starting reverse iterator taking care to handle the MBB->end()
-  // case.
-  MachineBasicBlock::reverse_iterator Start;
-  if (MI == MBB->end())
-    Start = MBB->rbegin();
-  else
-    Start = MI.getReverse();
+  MachineBasicBlock::reverse_iterator Start = MI.getReverse();
 
   // Otherwise, walk backwards through the block looking for a location where
   // SCC is dead.
@@ -122,8 +107,7 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
       break;
 
-    if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
-        !It->readsRegister(AMDGPU::SCC, TRI))
+    if (!llvm::isSccLiveAt(*It, LIS))
       return It->getIterator();
   }
 
@@ -134,20 +118,35 @@ MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
   //
   // The generated code will look like this;
   //
-  //      S_CSELECT_B32 %SavedSCC, -1, 0  # Save SCC
+  //      %SavedSCC = COPY $scc  # Save SCC
   //      <----- Newly created safe insert point.
   //      MI
-  //      S_CMP_LG_U32 %SavedSCC, 0       # Restore SCC
+  //      $scc = COPY %SavedSCC  # Restore SCC
   //
   Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   DebugLoc DL = MI->getDebugLoc();
-  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
-      .addImm(-1)
-      .addImm(0);
-  BuildMI(*MBB, std::next(MI->getIterator()), DL,
-          TII->get(AMDGPU::S_CMP_LG_U32))
-      .addReg(TmpScc, RegState::Kill)
-      .addImm(0);
+  auto CopyFrom =
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), TmpScc).addReg(AMDGPU::SCC);
+  auto CopyTo = BuildMI(*MBB, std::next(MI->getIterator()), DL,
+                        TII->get(AMDGPU::COPY), AMDGPU::SCC)
+                    .addReg(TmpScc);
+
+  // Cut the live segment.
+  auto SlotIndexes = LIS->getSlotIndexes();
+  SlotIndexes->insertMachineInstrInMaps(*CopyFrom);
+  SlotIndexes->insertMachineInstrInMaps(*CopyTo);
+  LiveRange &LR =
+      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+  auto OldSegment = *LR.getSegmentContaining(LIS->getInstructionIndex(*MI));
+  LiveRange::Segment NewSegA(
+      OldSegment.start,
+      SlotIndexes->getInstructionIndex(*CopyFrom).getRegSlot(),
+      OldSegment.valno);
+  LiveRange::Segment NewSegB(LIS->getInstructionIndex(*CopyTo).getRegSlot(),
+                             OldSegment.end, OldSegment.valno);
+  LR.removeSegment(OldSegment);
+  LR.addSegment(NewSegA);
+  LR.addSegment(NewSegB);
 
   return MI;
 }
@@ -164,341 +163,6 @@ void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
   }
 }
 
-LaneBitmask getRegMask(const MachineOperand &MO,
-                       const MachineRegisterInfo &MRI) {
-  // We don't rely on read-undef_ flag because in case of tentative schedule
-  // tracking it isn't set correctly yet. This works correctly however since
-  // use mask has been tracked before using LIS.
-  return MO.getSubReg() == 0
-             ? MRI.getMaxLaneMaskForVReg(MO.getReg())
-             : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(
-                   MO.getSubReg());
-}
-
-struct Piece {
-  unsigned Reg;
-  unsigned Offset;
-  unsigned Size;
-  static SmallVector<Piece, 8> split(std::bitset<32> Mask) {
-
-    SmallVector<Piece, 8> Pieces;
-    Piece Piece = {0, 0, 0};
-    for (unsigned i = 0; i < 32; i++) {
-      if (Mask.test(i)) {
-        if (Piece.Size == 0)
-          Piece.Offset = i;
-
-        Piece.Size++;
-        // Make sure no piece bigger than 8.
-        if (Piece.Size == 8) {
-          Pieces.emplace_back(Piece);
-          Piece.Size = 0;
-        }
-      } else {
-        if (Piece.Size == 0) {
-          continue;
-        }
-        Pieces.emplace_back(Piece);
-        Piece.Size = 0;
-      }
-    }
-    return Pieces;
-  }
-};
-
-static unsigned getNumLanesIn32BitReg(Register Reg, const SIRegisterInfo *SIRI,
-                                      const MachineRegisterInfo &MRI) {
-  const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
-  const TargetRegisterClass *SubregRC =
-      SIRI->getSubRegisterClass(RC, AMDGPU::sub0);
-  return SubregRC->LaneMask.getNumLanes();
-}
-
-static std::vector<unsigned>
-getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterInfo *TRI,
-                                          const TargetRegisterClass *RC,
-                                          LaneBitmask Mask) {
-  // TODO: this could replace the code it was copied from in SplitKit.cpp
-
-  // First pass: Try to find a perfectly matching subregister index.
-  // If none exists find the one covering the most lanemask bits.
-  SmallVector<unsigned, 8> PossibleIndexes;
-  unsigned BestIdx = 0;
-  const LaneBitmask Avoid = ~Mask;
-  {
-    unsigned BestCover = 0;
-    for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
-      // Is this index even compatible with the given class?
-      if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
-        continue;
-      LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
-      // Early exit if we found a perfect match.
-      if (SubRegMask == Mask) {
-        BestIdx = Idx;
-        break;
-      }
-
-      // The index must not cover any lanes outside
-      if ((SubRegMask & Avoid).any())
-        continue;
-
-      unsigned PopCount = SubRegMask.getNumLanes();
-      PossibleIndexes.push_back(Idx);
-      if (PopCount > BestCover) {
-        BestCover = PopCount;
-        BestIdx = Idx;
-      }
-    }
-  }
-
-  // Abort if we cannot possibly implement the COPY with the given indexes.
-  if (BestIdx == 0) {
-    LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
-                      << TRI->getRegClassName(RC) << " mask "
-                      << PrintLaneMask(Mask) << '\n');
-    assert(false && "Impossible to span reg class");
-    return std::vector<unsigned>();
-  }
-
-  std::vector<unsigned> Result;
-  Result.push_back(BestIdx);
-
-  // Greedy heuristic: Keep iterating keeping the best covering subreg index
-  // each time.
-  Mask &= ~(TRI->getSubRegIndexLaneMask(BestIdx));
-  while (Mask.any()) {
-    BestIdx = 0;
-    int BestCover = std::numeric_limits<int>::min();
-    for (unsigned Idx : PossibleIndexes) {
-      LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
-      // Early exit if we found a perfect match.
-      if (SubRegMask == Mask) {
-        BestIdx = Idx;
-        break;
-      }
-
-      // Guaranteed above
-      assert((SubRegMask & Avoid).none());
-
-      // Try to cover as much of the remaining lanes as possible but as few of
-      // the already covered lanes as possible.
-      int Cover = (SubRegMask & Mask).getNumLanes() -
-                  (SubRegMask & ~Mask).getNumLanes();
-      if (Cover > BestCover) {
-        BestCover = Cover;
-        BestIdx = Idx;
-      }
-    }
-
-    if (BestIdx == 0) {
-      LLVM_DEBUG(
-          dbgs() << "Unable to find minimal spanning sub register(s) for "
-                 << TRI->getRegClassName(RC) << " mask " << PrintLaneMask(Mask)
-                 << '\n');
-      assert(false && "Impossible to span reg class");
-      return std::vector<unsigned>();
-    }
-
-    Result.push_back(BestIdx);
-    Mask &= ~TRI->getSubRegIndexLaneMask(BestIdx);
-  }
-
-  return Result;
-}
-
-static void updateSubReg(MachineOperand &UseMO,
-                         const llvm::TargetRegisterClass *NewRC,
-                         unsigned Offset, const SIRegisterInfo *SIRI) {
-  unsigned Size = NewRC->getLaneMask().getNumLanes();
-  if (Size == 1) {
-    UseMO.setSubReg(0);
-  } else {
-    const uint32_t SubReg = UseMO.getSubReg();
-    LaneBitmask LaneMask = SIRI->getSubRegIndexLaneMask(SubReg);
-
-    unsigned Mask = LaneMask.getAsInteger() >> Offset;
-
-    unsigned NewSubReg = getMinimalSpanningSubRegIdxSetForLaneMask(
-                             SIRI, NewRC, LaneBitmask(Mask))
-                             .front();
-
-    UseMO.setSubReg(NewSubReg);
-  }
-}
-
-bool reduceChannel(unsigned Offset, MachineInstr &MI, const MCInstrDesc &Desc,
-                   MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
-                   const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) {
-  MachineOperand &DstMO = MI.getOperand(0);
-  // Skip case when dst subReg not 0.
-  if (DstMO.getSubReg())
-    return false;
-  Register Reg = DstMO.getReg();
-
-  SmallVector<MachineOperand *, 2> UseMOs;
-  for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg))
-    UseMOs.emplace_back(&UseMO);
-
-  const llvm::TargetRegisterClass *NewRC =
-      SIRI->getRegClass(Desc.operands().front().RegClass);
-  if (!NewRC->isAllocatable()) {
-    if (SIRI->isSGPRClass(NewRC))
-      NewRC = SIRI->getSGPRClassForBitWidth(NewRC->MC->RegSizeInBits);
-    else if (SIRI->isVGPRClass(NewRC))
-      NewRC = SIRI->getVGPRClassForBitWidth(NewRC->MC->RegSizeInBits);
-    else
-      return false;
-
-    if (!NewRC->isAllocatable())
-      return false;
-  }
-
-  unsigned NumLanes = NewRC->getLaneMask().getNumLanes();
-  if (Offset > 0) {
-    // Update offset operand in MI.
-    MachineOperand *OffsetOp =
-        SIII->getNamedOperand(MI, AMDGPU::OpName::offset);
-
-    const uint32_t LaneSize = sizeof(uint32_t);
-    if (OffsetOp) {
-      if (OffsetOp->isImm()) {
-        assert(OffsetOp != nullptr);
-        int64_t Offset = OffsetOp->getImm();
-        Offset += Offset * LaneSize;
-        if (!SIII->isLegalMUBUFImmOffset(Offset))
-          return false;
-        OffsetOp->setImm(Offset);
-      } else {
-        return false;
-      }
-    } else {
-      OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset);
-      if (OffsetOp) {
-        Register NewOffsetReg =
-            MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-        auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(),
-                                 SIII->get(AMDGPU::S_ADD_U32))
-                             .addDef(NewOffsetReg)
-                             .add(*OffsetOp)
-                             .addImm(Offset * LaneSize);
-        MachineInstr *OffsetAddMI = OffsetAdd.getInstr();
-        MachineBasicBlock::iterator InsertPoint =
-            llvm::findOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI,
-                                                      SIII, &MRI);
-        MI.getParent()->insert(InsertPoint, OffsetAddMI);
-        SIII->legalizeOperands(*OffsetAddMI);
-        OffsetOp->setReg(NewOffsetReg);
-        OffsetOp->setSubReg(0);
-        if (SlotIndexes)
-          SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI);
-      } else {
-        return false;
-      }
-    }
-    // Update subReg for users.
-    for (MachineOperand *UseMO : UseMOs)
-      updateSubReg(*UseMO, NewRC, Offset, SIRI);
-  } else if (NumLanes == getNumLanesIn32BitReg(Reg, SIRI, MRI)) {
-    // Clear subReg when it's a single 32-bit reg.
-    for (MachineOperand *UseMO : UseMOs)
-      UseMO->setSubReg(0);
-  }
-
-  MI.setDesc(Desc);
-  // Mutate reg class of Reg.
-  MRI.setRegClass(Reg, NewRC);
-  return true;
-}
-
-bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
-                       const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
-                       SlotIndexes *SlotIndexes) {
-  bool IsImm = false;
-  switch (MI.getOpcode()) {
-  default:
-    break;
-  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
-  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
-  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
-  case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM:
-    IsImm = true;
-    LLVM_FALLTHROUGH;
-  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
-  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
-  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
-    Register Reg = MI.getOperand(0).getReg();
-    if (!MRI.getUniqueVRegDef(Reg))
-      return false;
-    LaneBitmask DstMask = getRegMask(MI.getOperand(0), MRI);
-    LaneBitmask UseMask;
-    for (MachineOperand &MO : MRI.use_operands(Reg))
-      UseMask |= llvm::getRegMask(MO, MRI);
-
-    const unsigned FullMask = DstMask.getAsInteger();
-    unsigned Mask = UseMask.getAsInteger();
-    if (Mask == FullMask)
-      return false;
-    // Split mask when there's gap. Then group mask to 2/4/8.
-    auto Pieces = Piece::split(std::bitset<32>(Mask));
-    // Now only support 1 piece.
-    if (Pieces.size() != 1)
-      return false;
-    auto Piece = Pieces[0];
-    if (Piece.Size > 8)
-      return false;
-
-    // TODO: enable offset support when IsImm is true.
-    // Now if break different test when mul LaneSize or not mul for the offset.
-    if (IsImm && Piece.Offset != 0)
-      return false;
-
-    const unsigned Num32BitLanes =
-        Piece.Size / getNumLanesIn32BitReg(Reg, SIRI, MRI);
-
-    switch (Num32BitLanes) {
-    default:
-      return false;
-    case 1:
-      return reduceChannel(Piece.Offset, MI,
-                           SIII->get(IsImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM
-                                           : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR),
-                           MRI, SIRI, SIII, SlotIndexes);
-    case 2:
-      return reduceChannel(Piece.Offset, MI,
-                           SIII->get(IsImm
-                                         ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
-                                         : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR),
-                           MRI, SIRI, SIII, SlotIndexes);
-    case 3:
-      if (FullMask == 0xff)
-        return false;
-      LLVM_FALLTHROUGH;
-    case 4:
-      return reduceChannel(Piece.Offset, MI,
-                           SIII->get(IsImm
-                                         ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
-                                         : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR),
-                           MRI, SIRI, SIII, SlotIndexes);
-    case 5:
-    case 6:
-    case 7:
-      if (FullMask == 0xffff)
-        return false;
-      LLVM_FALLTHROUGH;
-    case 8:
-      return reduceChannel(Piece.Offset, MI,
-                           SIII->get(IsImm
-                                         ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM
-                                         : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR),
-                           MRI, SIRI, SIII, SlotIndexes);
-    }
-
-  } break;
-  }
-  return false;
-}
-
 unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
                     const llvm::MachineRegisterInfo &MRI,
                     const llvm::SIRegisterInfo *SIRI) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index d9fa63ba2b5ee..14cd350398f4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -21,6 +21,7 @@
 namespace llvm {
 
 class LiveInterval;
+class LiveIntervals;
 class SlotIndexes;
 class MachineRegisterInfo;
 class SIRegisterInfo;
@@ -38,8 +39,7 @@ bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
 using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
 void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI);
 
-bool isSccLiveAt(llvm::MachineBasicBlock *MBB,
-                 llvm::MachineBasicBlock::iterator MI);
+bool isSccLiveAt(const MachineInstr &MI, LiveIntervals *LIS);
 
 // An enum used to pass additional constraints to
 // `FindOrCreateInsertionPointForSccDef()`. This will further
@@ -66,7 +66,7 @@ enum SccDefInsertPointConstraintFlags {
 llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef(
     llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst,
     const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII,
-    llvm::MachineRegisterInfo *MRI,
+    llvm::MachineRegisterInfo *MRI, LiveIntervals *LIS,
     SccDefInsertPointConstraintFlags Constraints =
         SccDefInsertPointConstraintFlags::None);
 
diff --git a/llvm/test/CodeGen/AMDGPU/remat/phi.mir b/llvm/test/CodeGen/AMDGPU/remat/phi.mir
deleted file mode 100644
index 2d22e9fba2593..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/remat/phi.mir
+++ /dev/null
@@ -1,607 +0,0 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s  -amdgpu-remat-enable-hot-block-remat-aggressive  -run-pass=amdgpu-hot-block-remat -o - | FileCheck %s
-
-# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are
-# present.
-
-# CHECK: S_ENDPGM
-
---- |
-  source_filename = ".\main.ll"
-  define amdgpu_ps void @main() #1 {
-    ret void
-  }
-  attributes #1 = { "target-cpu"="gfx1010" }
-  !llvm.ident = !{!0}
-  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
-...
----
-name:            main
-tracksRegLiveness: true
-liveins:
-  - { reg: '$sgpr0' }
-  - { reg: '$sgpr1' }
-  - { reg: '$sgpr2' }
-  - { reg: '$sgpr3' }
-  - { reg: '$sgpr4' }
-  - { reg: '$sgpr5' }
-  - { reg: '$sgpr6' }
-  - { reg: '$sgpr7' }
-  - { reg: '$sgpr8' }
-  - { reg: '$sgpr8' }
-  - { reg: '$vgpr0' }
-  - { reg: '$vgpr1' }
-body:             |
-  bb.0:
-    successors: %bb.1, %bb.2
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
-
-    %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
-    ; undef %0.sub0:sgpr_64 = COPY $sgpr0
-    ; undef %0.sub1:sgpr_64 = COPY $sgpr1
-
-    %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
-    ; undef %1.sub0:sgpr_128 = COPY $sgpr4
-    ; undef %1.sub1:sgpr_128 = COPY $sgpr5
-    ; undef %1.sub2:sgpr_128 = COPY $sgpr6
-    ; undef %1.sub3:sgpr_128 = COPY $sgpr7
-
-
-    %2000:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2001:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2002:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2003:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2004:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2005:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2006:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2007:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2008:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2009:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2010:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2011:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2012:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2013:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2014:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2015:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2016:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2017:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2018:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2019:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2020:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2021:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2022:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2023:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2024:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2025:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2026:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2027:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2028:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2029:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2030:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2031:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2032:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2033:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2034:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2035:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2036:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2037:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2038:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2039:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2040:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2041:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2042:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2043:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2044:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2045:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2046:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2047:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2048:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2049:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2050:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2051:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2052:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2053:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2054:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2055:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2056:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2057:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2058:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2059:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2060:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2061:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2062:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2063:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2064:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2065:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2066:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2067:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2068:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2069:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2070:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2071:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2072:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2073:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2074:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2075:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2076:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2077:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2078:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2079:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2080:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2081:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2082:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2083:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2084:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2085:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2086:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2087:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2088:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2089:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2090:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2091:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2092:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2093:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2094:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2095:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2096:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2097:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2098:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %2099:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    %3000:sgpr_32 = S_MOV_B32 0
-    %3001:sgpr_32 = S_MOV_B32 1
-    %3002:sgpr_32 = S_MOV_B32 2
-    %3003:sgpr_32 = S_MOV_B32 3
-    %3004:sgpr_32 = S_MOV_B32 4
-    %3005:sgpr_32 = S_MOV_B32 5
-    %3006:sgpr_32 = S_MOV_B32 6
-    %3007:sgpr_32 = S_MOV_B32 7
-    %3008:sgpr_32 = S_MOV_B32 8
-    %3009:sgpr_32 = S_MOV_B32 9
-    %3010:sgpr_32 = S_MOV_B32 10
-    %3011:sgpr_32 = S_MOV_B32 11
-    %3012:sgpr_32 = S_MOV_B32 12
-    %3013:sgpr_32 = S_MOV_B32 13
-    %3014:sgpr_32 = S_MOV_B32 14
-    %3015:sgpr_32 = S_MOV_B32 15
-    %3016:sgpr_32 = S_MOV_B32 16
-    %3017:sgpr_32 = S_MOV_B32 17
-    %3018:sgpr_32 = S_MOV_B32 18
-    %3019:sgpr_32 = S_MOV_B32 19
-    %3020:sgpr_32 = S_MOV_B32 20
-    %3021:sgpr_32 = S_MOV_B32 21
-    %3022:sgpr_32 = S_MOV_B32 22
-    %3023:sgpr_32 = S_MOV_B32 23
-    %3024:sgpr_32 = S_MOV_B32 24
-    %3025:sgpr_32 = S_MOV_B32 25
-    %3026:sgpr_32 = S_MOV_B32 26
-    %3027:sgpr_32 = S_MOV_B32 27
-    %3028:sgpr_32 = S_MOV_B32 28
-    %3029:sgpr_32 = S_MOV_B32 29
-    %3030:sgpr_32 = S_MOV_B32 30
-    %3031:sgpr_32 = S_MOV_B32 31
-    %3032:sgpr_32 = S_MOV_B32 32
-    %3033:sgpr_32 = S_MOV_B32 33
-    %3034:sgpr_32 = S_MOV_B32 34
-    %3035:sgpr_32 = S_MOV_B32 35
-    %3036:sgpr_32 = S_MOV_B32 36
-    %3037:sgpr_32 = S_MOV_B32 37
-    %3038:sgpr_32 = S_MOV_B32 38
-    %3039:sgpr_32 = S_MOV_B32 39
-    %3040:sgpr_32 = S_MOV_B32 40
-    %3041:sgpr_32 = S_MOV_B32 41
-    %3042:sgpr_32 = S_MOV_B32 42
-    %3043:sgpr_32 = S_MOV_B32 43
-    %3044:sgpr_32 = S_MOV_B32 44
-    %3045:sgpr_32 = S_MOV_B32 45
-    %3046:sgpr_32 = S_MOV_B32 46
-    %3047:sgpr_32 = S_MOV_B32 47
-    %3048:sgpr_32 = S_MOV_B32 48
-    %3049:sgpr_32 = S_MOV_B32 49
-    %3050:sgpr_32 = S_MOV_B32 50
-    %3051:sgpr_32 = S_MOV_B32 51
-    %3052:sgpr_32 = S_MOV_B32 52
-    %3053:sgpr_32 = S_MOV_B32 53
-    %3054:sgpr_32 = S_MOV_B32 54
-    %3055:sgpr_32 = S_MOV_B32 55
-    %3056:sgpr_32 = S_MOV_B32 56
-    %3057:sgpr_32 = S_MOV_B32 57
-    %3058:sgpr_32 = S_MOV_B32 58
-    %3059:sgpr_32 = S_MOV_B32 59
-    %3060:sgpr_32 = S_MOV_B32 60
-    %3061:sgpr_32 = S_MOV_B32 61
-    %3062:sgpr_32 = S_MOV_B32 62
-    %3063:sgpr_32 = S_MOV_B32 63
-    %3064:sgpr_32 = S_MOV_B32 64
-    %3065:sgpr_32 = S_MOV_B32 65
-    %3066:sgpr_32 = S_MOV_B32 66
-    %3067:sgpr_32 = S_MOV_B32 67
-    %3068:sgpr_32 = S_MOV_B32 68
-    %3069:sgpr_32 = S_MOV_B32 69
-    %3070:sgpr_32 = S_MOV_B32 70
-    %3071:sgpr_32 = S_MOV_B32 71
-    %3072:sgpr_32 = S_MOV_B32 72
-    %3073:sgpr_32 = S_MOV_B32 73
-    %3074:sgpr_32 = S_MOV_B32 74
-    %3075:sgpr_32 = S_MOV_B32 75
-    %3076:sgpr_32 = S_MOV_B32 76
-    %3077:sgpr_32 = S_MOV_B32 77
-    %3078:sgpr_32 = S_MOV_B32 78
-    %3079:sgpr_32 = S_MOV_B32 79
-    %3080:sgpr_32 = S_MOV_B32 80
-    %3081:sgpr_32 = S_MOV_B32 81
-    %3082:sgpr_32 = S_MOV_B32 82
-    %3083:sgpr_32 = S_MOV_B32 83
-    %3084:sgpr_32 = S_MOV_B32 84
-    %3085:sgpr_32 = S_MOV_B32 85
-    %3086:sgpr_32 = S_MOV_B32 86
-    %3087:sgpr_32 = S_MOV_B32 87
-    %3088:sgpr_32 = S_MOV_B32 88
-    %3089:sgpr_32 = S_MOV_B32 89
-    %3090:sgpr_32 = S_MOV_B32 90
-    %3091:sgpr_32 = S_MOV_B32 91
-    %3092:sgpr_32 = S_MOV_B32 92
-    %3093:sgpr_32 = S_MOV_B32 93
-    %3094:sgpr_32 = S_MOV_B32 94
-    %3095:sgpr_32 = S_MOV_B32 95
-    %3096:sgpr_32 = S_MOV_B32 96
-    %3097:sgpr_32 = S_MOV_B32 97
-    %3098:sgpr_32 = S_MOV_B32 98
-    %3099:sgpr_32 = S_MOV_B32 99
-
-
-    %8000:vgpr_32 = IMPLICIT_DEF
-    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
-    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
-    S_CBRANCH_EXECZ %bb.2, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:  
-    successors: %bb.2
-
-    %8001:vgpr_32 = COPY %8000
-    %8002:vgpr_32 = COPY %8000
-    %8003:vgpr_32 = COPY %8000
-    %8004:vgpr_32 = COPY %8000
-    %8005:vgpr_32 = COPY %8000
-    %8006:vgpr_32 = COPY %8000
-    %8007:vgpr_32 = COPY %8000
-    %8008:vgpr_32 = COPY %8000
-    %8009:vgpr_32 = COPY %8000
-    %8010:vgpr_32 = COPY %8000
-    %8011:vgpr_32 = COPY %8000
-    %8012:vgpr_32 = COPY %8000
-    %8013:vgpr_32 = COPY %8000
-    %8014:vgpr_32 = COPY %8000
-    %8015:vgpr_32 = COPY %8000
-    %8016:vgpr_32 = COPY %8000
-    %8017:vgpr_32 = COPY %8000
-
-    %9001:vgpr_32 = COPY %8001
-    %9002:vgpr_32 = COPY %8002
-    %9003:vgpr_32 = COPY %8003
-    %9004:vgpr_32 = COPY %8004
-    %9005:vgpr_32 = COPY %8005
-    %9006:vgpr_32 = COPY %8006
-    %9007:vgpr_32 = COPY %8007
-    %9008:vgpr_32 = COPY %8008
-    %9009:vgpr_32 = COPY %8009
-    %9010:vgpr_32 = COPY %8010
-    %9011:vgpr_32 = COPY %8011
-    %9012:vgpr_32 = COPY %8012
-    %9013:vgpr_32 = COPY %8013
-    %9014:vgpr_32 = COPY %8014
-    %9015:vgpr_32 = COPY %8015
-    %9016:vgpr_32 = COPY %8016
-    %9017:vgpr_32 = COPY %8017
-
-    S_BRANCH %bb.2
-
-  bb.2:
-    %5000:sgpr_32 = PHI %3000, %bb.0, %8001, %bb.1
-    %5001:sgpr_32 = PHI %3001, %bb.0, %8001, %bb.1
-    %5002:sgpr_32 = PHI %3002, %bb.0, %8001, %bb.1
-    %5003:sgpr_32 = PHI %3003, %bb.0, %8001, %bb.1
-    %5004:sgpr_32 = PHI %3004, %bb.0, %8001, %bb.1
-    %5005:sgpr_32 = PHI %3005, %bb.0, %8001, %bb.1
-    %5006:sgpr_32 = PHI %3006, %bb.0, %8001, %bb.1
-    %5007:sgpr_32 = PHI %3007, %bb.0, %8001, %bb.1
-    %5008:sgpr_32 = PHI %3008, %bb.0, %8001, %bb.1
-    %5009:sgpr_32 = PHI %3009, %bb.0, %8001, %bb.1
-    %5010:sgpr_32 = PHI %3010, %bb.0, %8001, %bb.1
-    %5011:sgpr_32 = PHI %3011, %bb.0, %8001, %bb.1
-    %5012:sgpr_32 = PHI %3012, %bb.0, %8001, %bb.1
-    %5013:sgpr_32 = PHI %3013, %bb.0, %8001, %bb.1
-    %5014:sgpr_32 = PHI %3014, %bb.0, %8001, %bb.1
-    %5015:sgpr_32 = PHI %3015, %bb.0, %8001, %bb.1
-    %5016:sgpr_32 = PHI %3016, %bb.0, %8001, %bb.1
-    %5017:sgpr_32 = PHI %3017, %bb.0, %8001, %bb.1
-    %5018:sgpr_32 = PHI %3018, %bb.0, %8001, %bb.1
-    %5019:sgpr_32 = PHI %3019, %bb.0, %8001, %bb.1
-    %5020:sgpr_32 = PHI %3020, %bb.0, %8001, %bb.1
-    %5021:sgpr_32 = PHI %3021, %bb.0, %8001, %bb.1
-    %5022:sgpr_32 = PHI %3022, %bb.0, %8001, %bb.1
-    %5023:sgpr_32 = PHI %3023, %bb.0, %8001, %bb.1
-    %5024:sgpr_32 = PHI %3024, %bb.0, %8001, %bb.1
-    %5025:sgpr_32 = PHI %3025, %bb.0, %8001, %bb.1
-    %5026:sgpr_32 = PHI %3026, %bb.0, %8001, %bb.1
-    %5027:sgpr_32 = PHI %3027, %bb.0, %8001, %bb.1
-    %5028:sgpr_32 = PHI %3028, %bb.0, %8001, %bb.1
-    %5029:sgpr_32 = PHI %3029, %bb.0, %8001, %bb.1
-    %5030:sgpr_32 = PHI %3030, %bb.0, %8001, %bb.1
-    %5031:sgpr_32 = PHI %3031, %bb.0, %8001, %bb.1
-    %5032:sgpr_32 = PHI %3032, %bb.0, %8001, %bb.1
-    %5033:sgpr_32 = PHI %3033, %bb.0, %8001, %bb.1
-    %5034:sgpr_32 = PHI %3034, %bb.0, %8001, %bb.1
-    %5035:sgpr_32 = PHI %3035, %bb.0, %8001, %bb.1
-    %5036:sgpr_32 = PHI %3036, %bb.0, %8001, %bb.1
-    %5037:sgpr_32 = PHI %3037, %bb.0, %8001, %bb.1
-    %5038:sgpr_32 = PHI %3038, %bb.0, %8001, %bb.1
-    %5039:sgpr_32 = PHI %3039, %bb.0, %8001, %bb.1
-    %5040:sgpr_32 = PHI %3040, %bb.0, %8001, %bb.1
-    %5041:sgpr_32 = PHI %3041, %bb.0, %8001, %bb.1
-    %5042:sgpr_32 = PHI %3042, %bb.0, %8001, %bb.1
-    %5043:sgpr_32 = PHI %3043, %bb.0, %8001, %bb.1
-    %5044:sgpr_32 = PHI %3044, %bb.0, %8001, %bb.1
-    %5045:sgpr_32 = PHI %3045, %bb.0, %8001, %bb.1
-    %5046:sgpr_32 = PHI %3046, %bb.0, %8001, %bb.1
-    %5047:sgpr_32 = PHI %3047, %bb.0, %8001, %bb.1
-    %5048:sgpr_32 = PHI %3048, %bb.0, %8001, %bb.1
-    %5049:sgpr_32 = PHI %3049, %bb.0, %8001, %bb.1
-    %5050:sgpr_32 = PHI %3050, %bb.0, %8001, %bb.1
-    %5051:sgpr_32 = PHI %3051, %bb.0, %8001, %bb.1
-    %5052:sgpr_32 = PHI %3052, %bb.0, %8001, %bb.1
-    %5053:sgpr_32 = PHI %3053, %bb.0, %8001, %bb.1
-    %5054:sgpr_32 = PHI %3054, %bb.0, %8001, %bb.1
-    %5055:sgpr_32 = PHI %3055, %bb.0, %8001, %bb.1
-    %5056:sgpr_32 = PHI %3056, %bb.0, %8001, %bb.1
-    %5057:sgpr_32 = PHI %3057, %bb.0, %8001, %bb.1
-    %5058:sgpr_32 = PHI %3058, %bb.0, %8001, %bb.1
-    %5059:sgpr_32 = PHI %3059, %bb.0, %8001, %bb.1
-    %5060:sgpr_32 = PHI %3060, %bb.0, %8001, %bb.1
-    %5061:sgpr_32 = PHI %3061, %bb.0, %8001, %bb.1
-    %5062:sgpr_32 = PHI %3062, %bb.0, %8001, %bb.1
-    %5063:sgpr_32 = PHI %3063, %bb.0, %8001, %bb.1
-    %5064:sgpr_32 = PHI %3064, %bb.0, %8001, %bb.1
-    %5065:sgpr_32 = PHI %3065, %bb.0, %8001, %bb.1
-    %5066:sgpr_32 = PHI %3066, %bb.0, %8001, %bb.1
-    %5067:sgpr_32 = PHI %3067, %bb.0, %8001, %bb.1
-    %5068:sgpr_32 = PHI %3068, %bb.0, %8001, %bb.1
-    %5069:sgpr_32 = PHI %3069, %bb.0, %8001, %bb.1
-    %5070:sgpr_32 = PHI %3070, %bb.0, %8001, %bb.1
-    %5071:sgpr_32 = PHI %3071, %bb.0, %8001, %bb.1
-    %5072:sgpr_32 = PHI %3072, %bb.0, %8001, %bb.1
-    %5073:sgpr_32 = PHI %3073, %bb.0, %8001, %bb.1
-    %5074:sgpr_32 = PHI %3074, %bb.0, %8001, %bb.1
-    %5075:sgpr_32 = PHI %3075, %bb.0, %8001, %bb.1
-    %5076:sgpr_32 = PHI %3076, %bb.0, %8001, %bb.1
-    %5077:sgpr_32 = PHI %3077, %bb.0, %8001, %bb.1
-    %5078:sgpr_32 = PHI %3078, %bb.0, %8001, %bb.1
-    %5079:sgpr_32 = PHI %3079, %bb.0, %8001, %bb.1
-    %5080:sgpr_32 = PHI %3080, %bb.0, %8001, %bb.1
-    %5081:sgpr_32 = PHI %3081, %bb.0, %8001, %bb.1
-    %5082:sgpr_32 = PHI %3082, %bb.0, %8001, %bb.1
-    %5083:sgpr_32 = PHI %3083, %bb.0, %8001, %bb.1
-    %5084:sgpr_32 = PHI %3084, %bb.0, %8001, %bb.1
-    %5085:sgpr_32 = PHI %3085, %bb.0, %8001, %bb.1
-    %5086:sgpr_32 = PHI %3086, %bb.0, %8001, %bb.1
-    %5087:sgpr_32 = PHI %3087, %bb.0, %8001, %bb.1
-    %5088:sgpr_32 = PHI %3088, %bb.0, %8001, %bb.1
-    %5089:sgpr_32 = PHI %3089, %bb.0, %8001, %bb.1
-    %5090:sgpr_32 = PHI %3090, %bb.0, %8001, %bb.1
-    %5091:sgpr_32 = PHI %3091, %bb.0, %8001, %bb.1
-    %5092:sgpr_32 = PHI %3092, %bb.0, %8001, %bb.1
-    %5093:sgpr_32 = PHI %3093, %bb.0, %8001, %bb.1
-    %5094:sgpr_32 = PHI %3094, %bb.0, %8001, %bb.1
-    %5095:sgpr_32 = PHI %3095, %bb.0, %8001, %bb.1
-    %5096:sgpr_32 = PHI %3096, %bb.0, %8001, %bb.1
-    %5097:sgpr_32 = PHI %3097, %bb.0, %8001, %bb.1
-    %5098:sgpr_32 = PHI %3098, %bb.0, %8001, %bb.1
-    %5099:sgpr_32 = PHI %3099, %bb.0, %8001, %bb.1
-
-
-    %3:vgpr_32 = IMPLICIT_DEF
-
-    %6000:vgpr_32 = V_MOV_B32_e32 %5000, implicit $exec
-    %6001:vgpr_32 = V_MOV_B32_e32 %5001, implicit $exec
-    %6002:vgpr_32 = V_MOV_B32_e32 %5002, implicit $exec
-    %6003:vgpr_32 = V_MOV_B32_e32 %5003, implicit $exec
-    %6004:vgpr_32 = V_MOV_B32_e32 %5004, implicit $exec
-    %6005:vgpr_32 = V_MOV_B32_e32 %5005, implicit $exec
-    %6006:vgpr_32 = V_MOV_B32_e32 %5006, implicit $exec
-    %6007:vgpr_32 = V_MOV_B32_e32 %5007, implicit $exec
-    %6008:vgpr_32 = V_MOV_B32_e32 %5008, implicit $exec
-    %6009:vgpr_32 = V_MOV_B32_e32 %5009, implicit $exec
-    %6010:vgpr_32 = V_MOV_B32_e32 %5010, implicit $exec
-    %6011:vgpr_32 = V_MOV_B32_e32 %5011, implicit $exec
-    %6012:vgpr_32 = V_MOV_B32_e32 %5012, implicit $exec
-    %6013:vgpr_32 = V_MOV_B32_e32 %5013, implicit $exec
-    %6014:vgpr_32 = V_MOV_B32_e32 %5014, implicit $exec
-    %6015:vgpr_32 = V_MOV_B32_e32 %5015, implicit $exec
-    %6016:vgpr_32 = V_MOV_B32_e32 %5016, implicit $exec
-    %6017:vgpr_32 = V_MOV_B32_e32 %5017, implicit $exec
-    %6018:vgpr_32 = V_MOV_B32_e32 %5018, implicit $exec
-    %6019:vgpr_32 = V_MOV_B32_e32 %5019, implicit $exec
-    %6020:vgpr_32 = V_MOV_B32_e32 %5020, implicit $exec
-    %6021:vgpr_32 = V_MOV_B32_e32 %5021, implicit $exec
-    %6022:vgpr_32 = V_MOV_B32_e32 %5022, implicit $exec
-    %6023:vgpr_32 = V_MOV_B32_e32 %5023, implicit $exec
-    %6024:vgpr_32 = V_MOV_B32_e32 %5024, implicit $exec
-    %6025:vgpr_32 = V_MOV_B32_e32 %5025, implicit $exec
-    %6026:vgpr_32 = V_MOV_B32_e32 %5026, implicit $exec
-    %6027:vgpr_32 = V_MOV_B32_e32 %5027, implicit $exec
-    %6028:vgpr_32 = V_MOV_B32_e32 %5028, implicit $exec
-    %6029:vgpr_32 = V_MOV_B32_e32 %5029, implicit $exec
-    %6030:vgpr_32 = V_MOV_B32_e32 %5030, implicit $exec
-    %6031:vgpr_32 = V_MOV_B32_e32 %5031, implicit $exec
-    %6032:vgpr_32 = V_MOV_B32_e32 %5032, implicit $exec
-    %6033:vgpr_32 = V_MOV_B32_e32 %5033, implicit $exec
-    %6034:vgpr_32 = V_MOV_B32_e32 %5034, implicit $exec
-    %6035:vgpr_32 = V_MOV_B32_e32 %5035, implicit $exec
-    %6036:vgpr_32 = V_MOV_B32_e32 %5036, implicit $exec
-    %6037:vgpr_32 = V_MOV_B32_e32 %5037, implicit $exec
-    %6038:vgpr_32 = V_MOV_B32_e32 %5038, implicit $exec
-    %6039:vgpr_32 = V_MOV_B32_e32 %5039, implicit $exec
-    %6040:vgpr_32 = V_MOV_B32_e32 %5040, implicit $exec
-    %6041:vgpr_32 = V_MOV_B32_e32 %5041, implicit $exec
-    %6042:vgpr_32 = V_MOV_B32_e32 %5042, implicit $exec
-    %6043:vgpr_32 = V_MOV_B32_e32 %5043, implicit $exec
-    %6044:vgpr_32 = V_MOV_B32_e32 %5044, implicit $exec
-    %6045:vgpr_32 = V_MOV_B32_e32 %5045, implicit $exec
-    %6046:vgpr_32 = V_MOV_B32_e32 %5046, implicit $exec
-    %6047:vgpr_32 = V_MOV_B32_e32 %5047, implicit $exec
-    %6048:vgpr_32 = V_MOV_B32_e32 %5048, implicit $exec
-    %6049:vgpr_32 = V_MOV_B32_e32 %5049, implicit $exec
-    %6050:vgpr_32 = V_MOV_B32_e32 %5050, implicit $exec
-    %6051:vgpr_32 = V_MOV_B32_e32 %5051, implicit $exec
-    %6052:vgpr_32 = V_MOV_B32_e32 %5052, implicit $exec
-    %6053:vgpr_32 = V_MOV_B32_e32 %5053, implicit $exec
-    %6054:vgpr_32 = V_MOV_B32_e32 %5054, implicit $exec
-    %6055:vgpr_32 = V_MOV_B32_e32 %5055, implicit $exec
-    %6056:vgpr_32 = V_MOV_B32_e32 %5056, implicit $exec
-    %6057:vgpr_32 = V_MOV_B32_e32 %5057, implicit $exec
-    %6058:vgpr_32 = V_MOV_B32_e32 %5058, implicit $exec
-    %6059:vgpr_32 = V_MOV_B32_e32 %5059, implicit $exec
-    %6060:vgpr_32 = V_MOV_B32_e32 %5060, implicit $exec
-    %6061:vgpr_32 = V_MOV_B32_e32 %5061, implicit $exec
-    %6062:vgpr_32 = V_MOV_B32_e32 %5062, implicit $exec
-    %6063:vgpr_32 = V_MOV_B32_e32 %5063, implicit $exec
-    %6064:vgpr_32 = V_MOV_B32_e32 %5064, implicit $exec
-    %6065:vgpr_32 = V_MOV_B32_e32 %5065, implicit $exec
-    %6066:vgpr_32 = V_MOV_B32_e32 %5066, implicit $exec
-    %6067:vgpr_32 = V_MOV_B32_e32 %5067, implicit $exec
-    %6068:vgpr_32 = V_MOV_B32_e32 %5068, implicit $exec
-    %6069:vgpr_32 = V_MOV_B32_e32 %5069, implicit $exec
-    %6070:vgpr_32 = V_MOV_B32_e32 %5070, implicit $exec
-    %6071:vgpr_32 = V_MOV_B32_e32 %5071, implicit $exec
-    %6072:vgpr_32 = V_MOV_B32_e32 %5072, implicit $exec
-    %6073:vgpr_32 = V_MOV_B32_e32 %5073, implicit $exec
-    %6074:vgpr_32 = V_MOV_B32_e32 %5074, implicit $exec
-    %6075:vgpr_32 = V_MOV_B32_e32 %5075, implicit $exec
-    %6076:vgpr_32 = V_MOV_B32_e32 %5076, implicit $exec
-    %6077:vgpr_32 = V_MOV_B32_e32 %5077, implicit $exec
-    %6078:vgpr_32 = V_MOV_B32_e32 %5078, implicit $exec
-    %6079:vgpr_32 = V_MOV_B32_e32 %5079, implicit $exec
-    %6080:vgpr_32 = V_MOV_B32_e32 %5080, implicit $exec
-    %6081:vgpr_32 = V_MOV_B32_e32 %5081, implicit $exec
-    %6082:vgpr_32 = V_MOV_B32_e32 %5082, implicit $exec
-    %6083:vgpr_32 = V_MOV_B32_e32 %5083, implicit $exec
-    %6084:vgpr_32 = V_MOV_B32_e32 %5084, implicit $exec
-    %6085:vgpr_32 = V_MOV_B32_e32 %5085, implicit $exec
-    %6086:vgpr_32 = V_MOV_B32_e32 %5086, implicit $exec
-    %6087:vgpr_32 = V_MOV_B32_e32 %5087, implicit $exec
-    %6088:vgpr_32 = V_MOV_B32_e32 %5088, implicit $exec
-    %6089:vgpr_32 = V_MOV_B32_e32 %5089, implicit $exec
-    %6090:vgpr_32 = V_MOV_B32_e32 %5090, implicit $exec
-    %6091:vgpr_32 = V_MOV_B32_e32 %5091, implicit $exec
-    %6092:vgpr_32 = V_MOV_B32_e32 %5092, implicit $exec
-    %6093:vgpr_32 = V_MOV_B32_e32 %5093, implicit $exec
-    %6094:vgpr_32 = V_MOV_B32_e32 %5094, implicit $exec
-    %6095:vgpr_32 = V_MOV_B32_e32 %5095, implicit $exec
-    %6096:vgpr_32 = V_MOV_B32_e32 %5096, implicit $exec
-    %6097:vgpr_32 = V_MOV_B32_e32 %5097, implicit $exec
-    %6098:vgpr_32 = V_MOV_B32_e32 %5098, implicit $exec
-    %6099:vgpr_32 = V_MOV_B32_e32 %5099, implicit $exec
-    EXP 0, %6000, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6001, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6002, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6003, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6004, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6005, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6006, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6007, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6008, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6009, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6010, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6011, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6012, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6013, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6014, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6015, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6016, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6017, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6018, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6019, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6020, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6021, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6022, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6023, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6024, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6025, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6026, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6027, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6028, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6029, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6030, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6031, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6032, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6033, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6034, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6035, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6036, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6037, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6038, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6039, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6040, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6041, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6042, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6043, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6044, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6045, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6046, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6047, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6048, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6049, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6050, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6051, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6052, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6053, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6054, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6055, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6056, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6057, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6058, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6059, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6060, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6061, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6062, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6063, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6064, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6065, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6066, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6067, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6068, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6069, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6070, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6071, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6072, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6073, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6074, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6075, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6076, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6077, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6078, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6079, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6080, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6081, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6082, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6083, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6084, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6085, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6086, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6087, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6088, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6089, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6090, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6091, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6092, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6093, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6094, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6095, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6096, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6097, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6098, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, %6099, %3, %3, %3, -1, -1, 15, implicit $exec
-
-
-    S_ENDPGM 0
-...
-    
diff --git a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
deleted file mode 100644
index 02a9836313360..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/remat/reduce_lane_sgpr.mir
+++ /dev/null
@@ -1,565 +0,0 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-remat-enable-hot-block-remat-aggressive -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
-
-# Check that the buffer loads have been moved to their uses and that the lanes
-# are reduced correctly.
-#
-# CHECK: bb.2:
-#==========================================================================
-# X4_IMM, Using .x
-# CHECK: %[[#reg0:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 0, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 0, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg0]], %{{.+}}, 4, 0
-# X4_IMM, Using .xy
-# CHECK: %[[#reg1:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 16, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub0, %{{.+}}, 16, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg1]].sub1, %{{.+}}, 20, 0
-# X4_IMM, Using .xyz
-# CHECK: %[[#reg2:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 32, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub0, %{{.+}}, 32, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub1, %{{.+}}, 36, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg2]].sub2, %{{.+}}, 40, 0
-# X4_IMM, Using .yz
-# CHECK: %[[#reg3:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 48, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub1, %{{.+}}, 48, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg3]].sub2, %{{.+}}, 52, 0
-# X4_IMM, Using .yzw
-# CHECK: %[[#reg4:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 64, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub1, %{{.+}}, 64, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub2, %{{.+}}, 68, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg4]].sub3, %{{.+}}, 72, 0
-#==========================================================================
-# X8_IMM, Using .x
-# CHECK: %[[#reg5:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %{{.+}}, 80, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 80, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg5]], %{{.+}}, 84, 0
-# X8_IMM, Using .xy
-# CHECK: %[[#reg6:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %{{.+}}, 96, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub0, %{{.+}}, 96, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg6]].sub1, %{{.+}}, 100, 0
-# X8_IMM, Using .xyz
-# CHECK: %[[#reg7:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 112, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub0, %{{.+}}, 112, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub1, %{{.+}}, 116, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg7]].sub2, %{{.+}}, 120, 0
-# X8_IMM, Using .xyzw
-# CHECK: %[[#reg8:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 128, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub0, %{{.+}}, 128, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub1, %{{.+}}, 132, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub2, %{{.+}}, 136, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg8]].sub3, %{{.+}}, 140, 0
-# X8_IMM, Using .xyzw + 5th dword
-# CHECK: %[[#reg9:]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %{{.+}}, 144, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub0, %{{.+}}, 144, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub1, %{{.+}}, 148, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub2, %{{.+}}, 152, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub3, %{{.+}}, 156, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg9]].sub4, %{{.+}}, 160, 0
-#==========================================================================
-# X16_IMM, Using .xy and .zw
-# CHECK: %[[#reg10:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 160, 0
-# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub0_sub1, %{{.+}}, 160, 0
-# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg10]].sub2_sub3, %{{.+}}, 164, 0
-#==========================================================================
-# X4_SGPR, Using .x
-# CHECK: %[[#reg11:]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %{{.+}}, %{{.+}}, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 176, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg11]], %{{.+}}, 180, 0
-# X8_SGPR, Using .xy
-# CHECK: %[[#reg12:]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR %{{.+}}, %{{.+}}, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub0, %{{.+}}, 192, 0
-# CHECK: S_BUFFER_STORE_DWORD_IMM %[[#reg12]].sub1, %{{.+}}, 196, 0
-# X16_SGPR, Using .xy + .zw
-# CHECK: %[[#reg13:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %{{.+}}, %{{.+}}, 0
-# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub0_sub1, %{{.+}}, 208, 0
-# CHECK: S_BUFFER_STORE_DWORDX2_IMM %[[#reg13]].sub2_sub3, %{{.+}}, 216, 0
-#==========================================================================
-#
-#
-# CHECK: %[[#reg14:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 224, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0
-# CHECK: %[[#reg15:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 240, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0
-# CHECK: %[[#reg16:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 256, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0
-# CHECK: %[[#reg17:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 272, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0
-# CHECK: %[[#reg18:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 288, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0
-# CHECK: %[[#reg19:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 304, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0
-# CHECK: %[[#reg20:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 320, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0
-# CHECK: %[[#reg21:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 336, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0
-# CHECK: %[[#reg22:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 352, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0
-# CHECK: %[[#reg23:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 368, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0
-# CHECK: %[[#reg24:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 384, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0
-# CHECK: %[[#reg25:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 400, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0
-# CHECK: %[[#reg26:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 416, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0
-# CHECK: %[[#reg27:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 432, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0
-# CHECK: %[[#reg28:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 448, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0
-# CHECK: %[[#reg29:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 464, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0
-# CHECK: %[[#reg30:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 480, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0
-# CHECK: %[[#reg31:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 496, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0
-# CHECK: %[[#reg32:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 512, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0
-# CHECK: %[[#reg33:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 528, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0
-# CHECK: %[[#reg34:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 544, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0
-# CHECK: %[[#reg35:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 560, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0
-# CHECK: %[[#reg36:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 576, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0
-# CHECK: %[[#reg37:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 592, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0
-# CHECK: %[[#reg38:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 608, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0
-# CHECK: %[[#reg39:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 624, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0
-# CHECK: %[[#reg40:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 640, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0
-# CHECK: %[[#reg41:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 656, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0
-# CHECK: %[[#reg42:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 672, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0
-# CHECK: %[[#reg43:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 688, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0
-# CHECK: %[[#reg44:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 704, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0
-# CHECK: %[[#reg45:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 720, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0
-# CHECK: %[[#reg46:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 736, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0
-# CHECK: %[[#reg47:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 752, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0
-# CHECK: %[[#reg48:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 768, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0
-# CHECK: %[[#reg49:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 784, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0
-# CHECK: %[[#reg50:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 800, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0
-# CHECK: %[[#reg51:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 816, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0
-# CHECK: %[[#reg52:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 832, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0
-# CHECK: %[[#reg53:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 848, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0
-# CHECK: %[[#reg54:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 864, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0
-# CHECK: %[[#reg55:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 880, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0
-# CHECK: %[[#reg56:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 896, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0
-# CHECK: %[[#reg57:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 912, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0
-# CHECK: %[[#reg58:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 928, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0
-# CHECK: %[[#reg59:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 944, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0
-# CHECK: %[[#reg60:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 960, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0
-# CHECK: %[[#reg61:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 976, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0
-# CHECK: %[[#reg62:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 992, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0
-# CHECK: %[[#reg63:]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0
-
-
---- |
-  source_filename = ".\main.ll"
-  define amdgpu_ps void @main() #1 {
-    ret void
-  }
-  attributes #1 = { "target-cpu"="gfx1010" }
-  !llvm.ident = !{!0}
-  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
-...
----
-name:            main
-tracksRegLiveness: true
-liveins:
-  - { reg: '$sgpr0' }
-  - { reg: '$sgpr1' }
-  - { reg: '$sgpr2' }
-  - { reg: '$sgpr3' }
-  - { reg: '$sgpr4' }
-  - { reg: '$sgpr5' }
-  - { reg: '$sgpr6' }
-  - { reg: '$sgpr7' }
-  - { reg: '$sgpr8' }
-  - { reg: '$sgpr8' }
-  - { reg: '$vgpr0' }
-  - { reg: '$vgpr1' }
-body:             |
-  bb.0:
-    successors: %bb.1, %bb.2
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1
-
-    %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
-    %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
-    %2:sgpr_128 = REG_SEQUENCE $sgpr8, %subreg.sub0, $sgpr9, %subreg.sub1, $sgpr10, %subreg.sub2, $sgpr11, %subreg.sub3
-
-    ; X4_IMM
-    %3000:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 0, 0
-    %3001:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 16, 0
-    %3002:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 32, 0
-    %3003:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 48, 0
-    %3004:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 64, 0
-
-    ; X8_IMM
-    %3005:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 80, 0
-    %3006:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 96, 0
-    %3007:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 112, 0
-    %3008:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 128, 0
-    %3009:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %2:sgpr_128, 144, 0
-
-    ; X16_IMM
-    %30010:sgpr_512 = S_BUFFER_LOAD_DWORDX16_IMM %2:sgpr_128, 160, 0
-
-    ; X4_SGPR
-    %50:sgpr_32 = COPY $sgpr0
-    %30011:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR %2:sgpr_128, %50, 0
-
-    ; X8_SGPR
-    %51:sgpr_32 = COPY $sgpr1
-    %30012:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR %2:sgpr_128, %51, 0
-
-    ; X16_SGPR
-    %52:sgpr_32 = COPY $sgpr2
-    %30013:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR %2:sgpr_128, %52, 0
-
-    %30014:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 224, 0
-    %30015:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 240, 0
-    %30016:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 256, 0
-    %30017:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 272, 0
-    %30018:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 288, 0
-    %30019:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 304, 0
-    %30020:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 320, 0
-    %30021:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 336, 0
-    %30022:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 352, 0
-    %30023:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 368, 0
-    %30024:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 384, 0
-    %30025:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 400, 0
-    %30026:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 416, 0
-    %30027:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 432, 0
-    %30028:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 448, 0
-    %30029:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 464, 0
-    %30030:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 480, 0
-    %30031:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 496, 0
-    %30032:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 512, 0
-    %30033:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 528, 0
-    %30034:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 544, 0
-    %30035:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 560, 0
-    %30036:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 576, 0
-    %30037:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 592, 0
-    %30038:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 608, 0
-    %30039:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 624, 0
-    %30040:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 640, 0
-    %30041:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 656, 0
-    %30042:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 672, 0
-    %30043:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 688, 0
-    %30044:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 704, 0
-    %30045:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 720, 0
-    %30046:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 736, 0
-    %30047:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 752, 0
-    %30048:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 768, 0
-    %30049:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 784, 0
-    %30050:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 800, 0
-    %30051:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 816, 0
-    %30052:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 832, 0
-    %30053:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 848, 0
-    %30054:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 864, 0
-    %30055:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 880, 0
-    %30056:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 896, 0
-    %30057:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 912, 0
-    %30058:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 928, 0
-    %30059:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 944, 0
-    %30060:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 960, 0
-    %30061:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 976, 0
-    %30062:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 992, 0
-    %30063:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %2:sgpr_128, 1008, 0
-
-    %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-
-
-    %8000:vgpr_32 = IMPLICIT_DEF
-    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
-    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
-    S_CBRANCH_EXECZ %bb.2, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:  
-    successors: %bb.2
-    %8001:vgpr_32 = COPY %8000
-    S_BRANCH %bb.2
-
-  bb.2:
-
-    %3:vgpr_32 = IMPLICIT_DEF
-    ;==========================================================================
-    ; X4_IMM, Using .x
-    S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 0, 0
-    S_BUFFER_STORE_DWORD_IMM %3000.sub0, %1:sgpr_128, 4, 0 ; Do it a second time, since lane reduction is triggered on cloning, and cloning only happens when there are multiple uses.
-
-    ; X4_IMM, Using .xy
-    S_BUFFER_STORE_DWORD_IMM %3001.sub0, %1:sgpr_128, 16, 0
-    S_BUFFER_STORE_DWORD_IMM %3001.sub1, %1:sgpr_128, 20, 0
-
-    ; X4_IMM, Using .xyz
-    S_BUFFER_STORE_DWORD_IMM %3002.sub0, %1:sgpr_128, 32, 0
-    S_BUFFER_STORE_DWORD_IMM %3002.sub1, %1:sgpr_128, 36, 0
-    S_BUFFER_STORE_DWORD_IMM %3002.sub2, %1:sgpr_128, 40, 0
-
-    ; X4_IMM, Using .yz
-    S_BUFFER_STORE_DWORD_IMM %3003.sub1, %1:sgpr_128, 48, 0
-    S_BUFFER_STORE_DWORD_IMM %3003.sub2, %1:sgpr_128, 52, 0
-
-    ; X4_IMM, Using .yzw
-    S_BUFFER_STORE_DWORD_IMM %3004.sub1, %1:sgpr_128, 64, 0
-    S_BUFFER_STORE_DWORD_IMM %3004.sub2, %1:sgpr_128, 68, 0
-    S_BUFFER_STORE_DWORD_IMM %3004.sub3, %1:sgpr_128, 72, 0
-
-    ;==========================================================================
-    ; X8_IMM, Using .x
-    S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 80, 0
-    S_BUFFER_STORE_DWORD_IMM %3005.sub0, %1:sgpr_128, 84, 0
-    
-    ; X8_IMM, Using .xy
-    S_BUFFER_STORE_DWORD_IMM %3006.sub0, %1:sgpr_128, 96, 0
-    S_BUFFER_STORE_DWORD_IMM %3006.sub1, %1:sgpr_128, 100, 0
-
-    ; X8_IMM, Using .xyz
-    S_BUFFER_STORE_DWORD_IMM %3007.sub0, %1:sgpr_128, 112, 0
-    S_BUFFER_STORE_DWORD_IMM %3007.sub1, %1:sgpr_128, 116, 0
-    S_BUFFER_STORE_DWORD_IMM %3007.sub2, %1:sgpr_128, 120, 0
-
-    ; X8_IMM, Using .xyzw
-    S_BUFFER_STORE_DWORD_IMM %3008.sub0, %1:sgpr_128, 128, 0
-    S_BUFFER_STORE_DWORD_IMM %3008.sub1, %1:sgpr_128, 132, 0
-    S_BUFFER_STORE_DWORD_IMM %3008.sub2, %1:sgpr_128, 136, 0
-    S_BUFFER_STORE_DWORD_IMM %3008.sub3, %1:sgpr_128, 140, 0
-    
-    ; X8_IMM, Using .xyzw + 5th dword
-    S_BUFFER_STORE_DWORD_IMM %3009.sub0, %1:sgpr_128, 144, 0
-    S_BUFFER_STORE_DWORD_IMM %3009.sub1, %1:sgpr_128, 148, 0
-    S_BUFFER_STORE_DWORD_IMM %3009.sub2, %1:sgpr_128, 152, 0
-    S_BUFFER_STORE_DWORD_IMM %3009.sub3, %1:sgpr_128, 156, 0
-    S_BUFFER_STORE_DWORD_IMM %3009.sub4, %1:sgpr_128, 160, 0
-
-    ;==========================================================================
-    ; X16_IMM, Using .xy and .zw
-    S_BUFFER_STORE_DWORDX2_IMM %30010.sub0_sub1, %1:sgpr_128, 160, 0
-    S_BUFFER_STORE_DWORDX2_IMM %30010.sub2_sub3, %1:sgpr_128, 164, 0
-
-    ;==========================================================================
-    ; X4_SGPR, Using .x
-    S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 176, 0
-    S_BUFFER_STORE_DWORD_IMM %30011.sub0, %1:sgpr_128, 180, 0
-
-    ; X8_SGPR, Using .xy
-    S_BUFFER_STORE_DWORD_IMM %30012.sub0, %1:sgpr_128, 192, 0
-    S_BUFFER_STORE_DWORD_IMM %30012.sub1, %1:sgpr_128, 196, 0
-
-    ; X16_SGPR, Using .xy + .zw
-    S_BUFFER_STORE_DWORDX2_IMM %30013.sub0_sub1, %1:sgpr_128, 208, 0
-    S_BUFFER_STORE_DWORDX2_IMM %30013.sub2_sub3, %1:sgpr_128, 216, 0
-
-    ;==========================================================================
-    S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0
-
-    EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec
-
-
-    S_ENDPGM 0
-...
-
-
-
-
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
index 69875261b74e9..d6c6173cd523e 100644
--- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
@@ -1,452 +1,179 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s
 
 # Check that the loads have been moved to the use
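+# With -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr, the
+# S_LOAD_DWORDX4_IMM defs are expected to be rematerialized next to their
+# uses in bb.2 instead of staying in bb.0, as the CHECK lines below verify.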
+# CHECK: bb.0:
+# CHECK-NOT: S_LOAD_DWORDX4_IMM
 # CHECK: bb.2:
-# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0
-# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0
-# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0
-# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0
-# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0
-# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0
-# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0
-# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0
-# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0
-# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0
-# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0
-# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0
-# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0
-# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0
-# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0
-# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0
-# CHECK: %[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0
-# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0
-# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0
-# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0
-# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0
-# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0
-# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0
-# CHECK: %[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0
-# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0
-# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0
-# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0
-# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0
-# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0
-# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0
-# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0
-# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0
-# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0
-# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0
-# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0
-# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0
-# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0
-# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0
-# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0
-# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0
-# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0
-# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0
-# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 672, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0
-# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0
-# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0
-# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0
-# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0
-# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0
-# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0
-# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 784, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0
-# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0
-# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0
-# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0
-# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0
-# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0
-# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0
-# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0
-# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0
-# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0
-# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0
-# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0
-# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0
-# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0
-# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0
-# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0
+# CHECK: %t0:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 0, 0
+# CHECK: KILL %t0
+# CHECK: %t2:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 16, 0
+# CHECK: KILL %t2
+# CHECK: %t4:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 32, 0
+# CHECK: KILL %t4
+# CHECK: %t6:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 48, 0
+# CHECK: KILL %t6
+# CHECK: %t8:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 64, 0
+# CHECK: KILL %t8
+# CHECK: %t10:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 80, 0
+# CHECK: KILL %t10
+# CHECK: %t12:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 96, 0
+# CHECK: KILL %t12
+# CHECK: %t14:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 112, 0
+# CHECK: KILL %t14
+# CHECK: %t16:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 128, 0
+# CHECK: KILL %t16
+# CHECK: %t18:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 144, 0
+# CHECK: KILL %t18
+# CHECK: %t20:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 160, 0
+# CHECK: KILL %t20
+# CHECK: %t22:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 176, 0
+# CHECK: KILL %t22
+# CHECK: %t24:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 192, 0
+# CHECK: KILL %t24
+# CHECK: %t26:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 208, 0
+# CHECK: KILL %t26
+# CHECK: %t28:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 224, 0
+# CHECK: KILL %t28
+# CHECK: %t30:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 240, 0
+# CHECK: KILL %t30
+# CHECK: %t32:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 256, 0
+# CHECK: KILL %t32
+# CHECK: %t34:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 272, 0
+# CHECK: KILL %t34
+# CHECK: %t36:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 288, 0
+# CHECK: KILL %t36
+# CHECK: %t38:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 304, 0
+# CHECK: KILL %t38
+# CHECK: %t40:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 320, 0
+# CHECK: KILL %t40
+# CHECK: %t42:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 336, 0
+# CHECK: KILL %t42
+# CHECK: %t44:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 352, 0
+# CHECK: KILL %t44
+# CHECK: %t46:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 368, 0
+# CHECK: KILL %t46
+# CHECK: %t48:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 384, 0
+# CHECK: KILL %t48
+# CHECK: %t50:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 400, 0
+# CHECK: KILL %t50
+# CHECK: %t52:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 416, 0
+# CHECK: KILL %t52
+# CHECK: %t54:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 432, 0
+# CHECK: KILL %t54
+# CHECK: %t56:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 448, 0
+# CHECK: KILL %t56
+# CHECK: %t58:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 464, 0
+# CHECK: KILL %t58
+# CHECK: %t60:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 480, 0
+# CHECK: KILL %t60
+# CHECK: %t62:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 496, 0
+# CHECK: KILL %t62
 
 
 --- |
-  source_filename = ".\main.ll"
-  define amdgpu_ps void @main() #1 {
+  define amdgpu_ps void @main() {
     ret void
   }
-  attributes #1 = { "target-cpu"="gfx1010" }
-  !llvm.ident = !{!0}
-  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
 ...
 ---
 name:            main
 tracksRegLiveness: true
-liveins:
-  - { reg: '$sgpr0' }
-  - { reg: '$sgpr1' }
-  - { reg: '$sgpr2' }
-  - { reg: '$sgpr3' }
-  - { reg: '$sgpr4' }
-  - { reg: '$sgpr5' }
-  - { reg: '$sgpr6' }
-  - { reg: '$sgpr7' }
-  - { reg: '$sgpr8' }
-  - { reg: '$sgpr8' }
-  - { reg: '$vgpr0' }
-  - { reg: '$vgpr1' }
 body:             |
   bb.0:
     successors: %bb.1, %bb.2
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+    ; To inflate vgpr count
+    %v0:vreg_1024 = IMPLICIT_DEF
+    %v1:vreg_1024 = IMPLICIT_DEF
+    %v2:vreg_1024 = IMPLICIT_DEF
+    %v3:vreg_1024 = IMPLICIT_DEF
 
-    %0:sgpr_64 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1
-    ; undef %0.sub0:sgpr_64 = COPY $sgpr0
-    ; undef %0.sub1:sgpr_64 = COPY $sgpr1
+    %ptr:sgpr_64 = IMPLICIT_DEF
 
-    %1:sgpr_128 = REG_SEQUENCE $sgpr4, %subreg.sub0, $sgpr5, %subreg.sub1, $sgpr6, %subreg.sub2, $sgpr7, %subreg.sub3
-    ; undef %1.sub0:sgpr_128 = COPY $sgpr4
-    ; undef %1.sub1:sgpr_128 = COPY $sgpr5
-    ; undef %1.sub2:sgpr_128 = COPY $sgpr6
-    ; undef %1.sub3:sgpr_128 = COPY $sgpr7
+    ; Defs
+    %t0:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 0, 0
+    %t2:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 16, 0
+    %t4:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 32, 0
+    %t6:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 48, 0
+    %t8:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 64, 0
+    %t10:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 80, 0
+    %t12:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 96, 0
+    %t14:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 112, 0
+    %t16:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 128, 0
+    %t18:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 144, 0
+    %t20:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 160, 0
+    %t22:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 176, 0
+    %t24:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 192, 0
+    %t26:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 208, 0
+    %t28:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 224, 0
+    %t30:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 240, 0
+    %t32:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 256, 0
+    %t34:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 272, 0
+    %t36:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 288, 0
+    %t38:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 304, 0
+    %t40:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 320, 0
+    %t42:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 336, 0
+    %t44:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 352, 0
+    %t46:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 368, 0
+    %t48:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 384, 0
+    %t50:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 400, 0
+    %t52:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 416, 0
+    %t54:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 432, 0
+    %t56:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 448, 0
+    %t58:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 464, 0
+    %t60:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 480, 0
+    %t62:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 496, 0
 
-    %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0
-    %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0
-    %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0
-    %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0
-    %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0
-    %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0
-    %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0
-    %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0
-    %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0
-    %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0
-    %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0
-    %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0
-    %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0
-    %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0
-    %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0
-    %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0
-    %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0
-    %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0
-    %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0
-    %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0
-    %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0
-    %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0
-    %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0
-    %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0
-    %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0
-    %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0
-    %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0
-    %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0
-    %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0
-    %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0
-    %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0
-    %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0
-    %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0
-    %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0
-    %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0
-    %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0
-    %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0
-    %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0
-    %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0
-    %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0
-    %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0
-    %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0
-    %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0
-    %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0
-    %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0
-    %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0
-    %30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0
-    %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0
-    %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0
-    %30049:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0
-    %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0
-    %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0
-    %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0
-    %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0
-    %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0
-    %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0
-    %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0
-    %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0
-    %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0
-    %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0
-    %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0
-    %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0
-    %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0
-    %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0
 
-    %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-
-
-    %8000:vgpr_32 = IMPLICIT_DEF
-    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
-    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+    ; Branch
+    %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0
     S_CBRANCH_EXECZ %bb.2, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:  
     successors: %bb.2
-    %8001:vgpr_32 = COPY %8000
     S_BRANCH %bb.2
 
   bb.2:
-
-    %3:vgpr_32 = IMPLICIT_DEF
-    S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0
-    S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0
-
-    EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec
-    EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec
+    KILL %t0
+    KILL %t2
+    KILL %t4
+    KILL %t6
+    KILL %t8
+    KILL %t10
+    KILL %t12
+    KILL %t14
+    KILL %t16
+    KILL %t18
+    KILL %t20
+    KILL %t22
+    KILL %t24
+    KILL %t26
+    KILL %t28
+    KILL %t30
+    KILL %t32
+    KILL %t34
+    KILL %t36
+    KILL %t38
+    KILL %t40
+    KILL %t42
+    KILL %t44
+    KILL %t46
+    KILL %t48
+    KILL %t50
+    KILL %t52
+    KILL %t54
+    KILL %t56
+    KILL %t58
+    KILL %t60
+    KILL %t62
 
 
+    ; Some uses to inflate vgpr count
+    KILL %v0
+    KILL %v1
+    KILL %v2
+    KILL %v3
     S_ENDPGM 0
 ...
+    
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir
new file mode 100644
index 0000000000000..a4e9c69d53b7c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir
@@ -0,0 +1,575 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s
+
+# This test checks that when there is no safe spot to clone/move instructions
+# that modify $scc, a safe spot is created for them.
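+# The mechanism, verified by the CHECK lines below, is to copy $scc into an
+# SGPR before the cloned S_NOT_B32 sequence and copy it back afterwards, so
+# the rematerialized defs do not clobber the live $scc value.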
+
+# CHECK: bb.0:
+# CHECK-NOT: S_NOT_B32
+# CHECK: bb.2:
+# Save $scc
+# CHECK: %[[#scc0:]]:sreg_32_xm0 = COPY $scc
+# CHECK: %t0:sgpr_32 = S_NOT_B32 0
+# CHECK: KILL %t0
+# All subsequent moves are placed within the safe spot created for the first one.
+# CHECK: %t2:sgpr_32 = S_NOT_B32 1
+# CHECK: %t4:sgpr_32 = S_NOT_B32 2
+# CHECK: %t6:sgpr_32 = S_NOT_B32 3
+# CHECK: %t8:sgpr_32 = S_NOT_B32 4
+# CHECK: %t10:sgpr_32 = S_NOT_B32 5
+# CHECK: %t12:sgpr_32 = S_NOT_B32 6
+# CHECK: %t14:sgpr_32 = S_NOT_B32 7
+# CHECK: %t16:sgpr_32 = S_NOT_B32 8
+# CHECK: %t18:sgpr_32 = S_NOT_B32 9
+# CHECK: %t20:sgpr_32 = S_NOT_B32 10
+# CHECK: %t22:sgpr_32 = S_NOT_B32 11
+# CHECK: %t24:sgpr_32 = S_NOT_B32 12
+# CHECK: %t26:sgpr_32 = S_NOT_B32 13
+# CHECK: %t28:sgpr_32 = S_NOT_B32 14
+# CHECK: %t30:sgpr_32 = S_NOT_B32 15
+# CHECK: %t32:sgpr_32 = S_NOT_B32 16
+# CHECK: %t34:sgpr_32 = S_NOT_B32 17
+# CHECK: %t36:sgpr_32 = S_NOT_B32 18
+# CHECK: %t38:sgpr_32 = S_NOT_B32 19
+# CHECK: %t40:sgpr_32 = S_NOT_B32 20
+# CHECK: %t42:sgpr_32 = S_NOT_B32 21
+# CHECK: %t44:sgpr_32 = S_NOT_B32 22
+# CHECK: %t46:sgpr_32 = S_NOT_B32 23
+# CHECK: %t48:sgpr_32 = S_NOT_B32 24
+# CHECK: %t50:sgpr_32 = S_NOT_B32 25
+# CHECK: %t52:sgpr_32 = S_NOT_B32 26
+# CHECK: %t54:sgpr_32 = S_NOT_B32 27
+# CHECK: %t56:sgpr_32 = S_NOT_B32 28
+# CHECK: %t58:sgpr_32 = S_NOT_B32 29
+# CHECK: %t60:sgpr_32 = S_NOT_B32 30
+# CHECK: %t62:sgpr_32 = S_NOT_B32 31
+# CHECK: %t64:sgpr_32 = S_NOT_B32 32
+# CHECK: %t66:sgpr_32 = S_NOT_B32 33
+# CHECK: %t68:sgpr_32 = S_NOT_B32 34
+# CHECK: %t70:sgpr_32 = S_NOT_B32 35
+# CHECK: %t72:sgpr_32 = S_NOT_B32 36
+# CHECK: %t74:sgpr_32 = S_NOT_B32 37
+# CHECK: %t76:sgpr_32 = S_NOT_B32 38
+# CHECK: %t78:sgpr_32 = S_NOT_B32 39
+# CHECK: %t80:sgpr_32 = S_NOT_B32 40
+# CHECK: %t82:sgpr_32 = S_NOT_B32 41
+# CHECK: %t84:sgpr_32 = S_NOT_B32 42
+# CHECK: %t86:sgpr_32 = S_NOT_B32 43
+# CHECK: %t88:sgpr_32 = S_NOT_B32 44
+# CHECK: %t90:sgpr_32 = S_NOT_B32 45
+# CHECK: %t92:sgpr_32 = S_NOT_B32 46
+# CHECK: %t94:sgpr_32 = S_NOT_B32 47
+# CHECK: %t96:sgpr_32 = S_NOT_B32 48
+# CHECK: %t98:sgpr_32 = S_NOT_B32 49
+# CHECK: %t100:sgpr_32 = S_NOT_B32 50
+# CHECK: %t102:sgpr_32 = S_NOT_B32 51
+# CHECK: %t104:sgpr_32 = S_NOT_B32 52
+# CHECK: %t106:sgpr_32 = S_NOT_B32 53
+# CHECK: %t108:sgpr_32 = S_NOT_B32 54
+# CHECK: %t110:sgpr_32 = S_NOT_B32 55
+# CHECK: %t112:sgpr_32 = S_NOT_B32 56
+# CHECK: %t114:sgpr_32 = S_NOT_B32 57
+# CHECK: %t116:sgpr_32 = S_NOT_B32 58
+# CHECK: %t118:sgpr_32 = S_NOT_B32 59
+# CHECK: %t120:sgpr_32 = S_NOT_B32 60
+# CHECK: %t122:sgpr_32 = S_NOT_B32 61
+# CHECK: %t124:sgpr_32 = S_NOT_B32 62
+# CHECK: %t126:sgpr_32 = S_NOT_B32 63
+# CHECK: %t128:sgpr_32 = S_NOT_B32 64
+# CHECK: %t130:sgpr_32 = S_NOT_B32 65
+# CHECK: %t132:sgpr_32 = S_NOT_B32 66
+# CHECK: %t134:sgpr_32 = S_NOT_B32 67
+# CHECK: %t136:sgpr_32 = S_NOT_B32 68
+# CHECK: %t138:sgpr_32 = S_NOT_B32 69
+# CHECK: %t140:sgpr_32 = S_NOT_B32 70
+# CHECK: %t142:sgpr_32 = S_NOT_B32 71
+# CHECK: %t144:sgpr_32 = S_NOT_B32 72
+# CHECK: %t146:sgpr_32 = S_NOT_B32 73
+# CHECK: %t148:sgpr_32 = S_NOT_B32 74
+# CHECK: %t150:sgpr_32 = S_NOT_B32 75
+# CHECK: %t152:sgpr_32 = S_NOT_B32 76
+# CHECK: %t154:sgpr_32 = S_NOT_B32 77
+# CHECK: %t156:sgpr_32 = S_NOT_B32 78
+# CHECK: %t158:sgpr_32 = S_NOT_B32 79
+# CHECK: %t160:sgpr_32 = S_NOT_B32 80
+# CHECK: %t162:sgpr_32 = S_NOT_B32 81
+# CHECK: %t164:sgpr_32 = S_NOT_B32 82
+# CHECK: %t166:sgpr_32 = S_NOT_B32 83
+# CHECK: %t168:sgpr_32 = S_NOT_B32 84
+# CHECK: %t170:sgpr_32 = S_NOT_B32 85
+# CHECK: %t172:sgpr_32 = S_NOT_B32 86
+# CHECK: %t174:sgpr_32 = S_NOT_B32 87
+# CHECK: %t176:sgpr_32 = S_NOT_B32 88
+# CHECK: %t178:sgpr_32 = S_NOT_B32 89
+# CHECK: %t180:sgpr_32 = S_NOT_B32 90
+# CHECK: %t182:sgpr_32 = S_NOT_B32 91
+# CHECK: %t184:sgpr_32 = S_NOT_B32 92
+# CHECK: %t186:sgpr_32 = S_NOT_B32 93
+# CHECK: %t188:sgpr_32 = S_NOT_B32 94
+# CHECK: %t190:sgpr_32 = S_NOT_B32 95
+# CHECK: %t192:sgpr_32 = S_NOT_B32 96
+# CHECK: %t194:sgpr_32 = S_NOT_B32 97
+# CHECK: %t196:sgpr_32 = S_NOT_B32 98
+# CHECK: %t198:sgpr_32 = S_NOT_B32 99
+# CHECK: %t200:sgpr_32 = S_NOT_B32 100
+# CHECK: %t202:sgpr_32 = S_NOT_B32 101
+# CHECK: %t204:sgpr_32 = S_NOT_B32 102
+# CHECK: %t206:sgpr_32 = S_NOT_B32 103
+# CHECK: %t208:sgpr_32 = S_NOT_B32 104
+# CHECK: %t210:sgpr_32 = S_NOT_B32 105
+# CHECK: %t212:sgpr_32 = S_NOT_B32 106
+# CHECK: %t214:sgpr_32 = S_NOT_B32 107
+# CHECK: %t216:sgpr_32 = S_NOT_B32 108
+# CHECK: %t218:sgpr_32 = S_NOT_B32 109
+# CHECK: %t220:sgpr_32 = S_NOT_B32 110
+# CHECK: %t222:sgpr_32 = S_NOT_B32 111
+# CHECK: %t224:sgpr_32 = S_NOT_B32 112
+# CHECK: %t226:sgpr_32 = S_NOT_B32 113
+# CHECK: %t228:sgpr_32 = S_NOT_B32 114
+# CHECK: %t230:sgpr_32 = S_NOT_B32 115
+# CHECK: %t232:sgpr_32 = S_NOT_B32 116
+# CHECK: %t234:sgpr_32 = S_NOT_B32 117
+# CHECK: %t236:sgpr_32 = S_NOT_B32 118
+# CHECK: %t238:sgpr_32 = S_NOT_B32 119
+# CHECK: %t240:sgpr_32 = S_NOT_B32 120
+# CHECK: %t242:sgpr_32 = S_NOT_B32 121
+# CHECK: %t244:sgpr_32 = S_NOT_B32 122
+# CHECK: %t246:sgpr_32 = S_NOT_B32 123
+# CHECK: %t248:sgpr_32 = S_NOT_B32 124
+# CHECK: %t250:sgpr_32 = S_NOT_B32 125
+# CHECK: %t252:sgpr_32 = S_NOT_B32 126
+# CHECK: %t254:sgpr_32 = S_NOT_B32 127
+# Restore $scc
+# CHECK: $scc = COPY %[[#scc0]]
+# CHECK: KILL %t2
+# CHECK: KILL %t4
+# CHECK: KILL %t6
+# CHECK: KILL %t8
+# CHECK: KILL %t10
+# CHECK: KILL %t12
+# CHECK: KILL %t14
+# CHECK: KILL %t16
+# CHECK: KILL %t18
+# CHECK: KILL %t20
+# CHECK: KILL %t22
+# CHECK: KILL %t24
+# CHECK: KILL %t26
+# CHECK: KILL %t28
+# CHECK: KILL %t30
+# CHECK: KILL %t32
+# CHECK: KILL %t34
+# CHECK: KILL %t36
+# CHECK: KILL %t38
+# CHECK: KILL %t40
+# CHECK: KILL %t42
+# CHECK: KILL %t44
+# CHECK: KILL %t46
+# CHECK: KILL %t48
+# CHECK: KILL %t50
+# CHECK: KILL %t52
+# CHECK: KILL %t54
+# CHECK: KILL %t56
+# CHECK: KILL %t58
+# CHECK: KILL %t60
+# CHECK: KILL %t62
+# CHECK: KILL %t64
+# CHECK: KILL %t66
+# CHECK: KILL %t68
+# CHECK: KILL %t70
+# CHECK: KILL %t72
+# CHECK: KILL %t74
+# CHECK: KILL %t76
+# CHECK: KILL %t78
+# CHECK: KILL %t80
+# CHECK: KILL %t82
+# CHECK: KILL %t84
+# CHECK: KILL %t86
+# CHECK: KILL %t88
+# CHECK: KILL %t90
+# CHECK: KILL %t92
+# CHECK: KILL %t94
+# CHECK: KILL %t96
+# CHECK: KILL %t98
+# CHECK: KILL %t100
+# CHECK: KILL %t102
+# CHECK: KILL %t104
+# CHECK: KILL %t106
+# CHECK: KILL %t108
+# CHECK: KILL %t110
+# CHECK: KILL %t112
+# CHECK: KILL %t114
+# CHECK: KILL %t116
+# CHECK: KILL %t118
+# CHECK: KILL %t120
+# CHECK: KILL %t122
+# CHECK: KILL %t124
+# CHECK: KILL %t126
+# CHECK: KILL %t128
+# CHECK: KILL %t130
+# CHECK: KILL %t132
+# CHECK: KILL %t134
+# CHECK: KILL %t136
+# CHECK: KILL %t138
+# CHECK: KILL %t140
+# CHECK: KILL %t142
+# CHECK: KILL %t144
+# CHECK: KILL %t146
+# CHECK: KILL %t148
+# CHECK: KILL %t150
+# CHECK: KILL %t152
+# CHECK: KILL %t154
+# CHECK: KILL %t156
+# CHECK: KILL %t158
+# CHECK: KILL %t160
+# CHECK: KILL %t162
+# CHECK: KILL %t164
+# CHECK: KILL %t166
+# CHECK: KILL %t168
+# CHECK: KILL %t170
+# CHECK: KILL %t172
+# CHECK: KILL %t174
+# CHECK: KILL %t176
+# CHECK: KILL %t178
+# CHECK: KILL %t180
+# CHECK: KILL %t182
+# CHECK: KILL %t184
+# CHECK: KILL %t186
+# CHECK: KILL %t188
+# CHECK: KILL %t190
+# CHECK: KILL %t192
+# CHECK: KILL %t194
+# CHECK: KILL %t196
+# CHECK: KILL %t198
+# CHECK: KILL %t200
+# CHECK: KILL %t202
+# CHECK: KILL %t204
+# CHECK: KILL %t206
+# CHECK: KILL %t208
+# CHECK: KILL %t210
+# CHECK: KILL %t212
+# CHECK: KILL %t214
+# CHECK: KILL %t216
+# CHECK: KILL %t218
+# CHECK: KILL %t220
+# CHECK: KILL %t222
+# CHECK: KILL %t224
+# CHECK: KILL %t226
+# CHECK: KILL %t228
+# CHECK: KILL %t230
+# CHECK: KILL %t232
+# CHECK: KILL %t234
+# CHECK: KILL %t236
+# CHECK: KILL %t238
+# CHECK: KILL %t240
+# CHECK: KILL %t242
+# CHECK: KILL %t244
+# CHECK: KILL %t246
+# CHECK: KILL %t248
+# CHECK: KILL %t250
+# CHECK: KILL %t252
+# CHECK: KILL %t254
+
+
+--- |
+  define amdgpu_ps void @main() {
+    ret void
+  }
+...
+---
+name:            main
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    ; To inflate vgpr count
+    %v0:vreg_1024 = IMPLICIT_DEF
+    %v1:vreg_1024 = IMPLICIT_DEF
+    %v2:vreg_1024 = IMPLICIT_DEF
+    %v3:vreg_1024 = IMPLICIT_DEF
+
+    ; Defs
+    %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc
+    %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc
+    %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc
+    %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc
+    %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc
+    %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc
+    %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc
+    %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc
+    %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc
+    %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc
+    %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc
+    %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc
+    %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc
+    %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc
+    %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc
+    %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc
+    %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc
+    %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc
+    %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc
+    %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc
+    %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc
+    %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc
+    %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc
+    %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc
+    %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc
+    %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc
+    %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc
+    %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc
+    %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc
+    %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc
+    %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc
+    %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc
+    %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc
+    %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc
+    %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc
+    %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc
+    %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc
+    %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc
+    %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc
+    %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc
+    %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc
+    %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc
+    %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc
+    %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc
+    %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc
+    %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc
+    %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc
+    %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc
+    %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc
+    %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc
+    %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc
+    %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc
+    %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc
+    %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc
+    %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc
+    %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc
+    %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc
+    %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc
+    %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc
+    %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc
+    %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc
+    %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc
+    %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc
+    %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc
+    %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc
+    %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc
+    %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc
+    %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc
+    %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc
+    %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc
+    %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc
+    %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc
+    %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc
+    %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc
+    %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc
+    %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc
+    %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc
+    %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc
+    %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc
+    %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc
+    %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc
+    %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc
+    %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc
+    %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc
+    %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc
+    %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc
+    %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc
+    %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc
+    %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc
+    %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc
+    %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc
+    %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc
+    %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc
+    %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc
+    %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc
+    %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc
+    %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc
+    %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc
+    %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc
+    %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc
+    %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc
+    %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc
+    %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc
+    %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc
+    %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc
+    %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc
+    %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc
+    %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc
+    %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc
+    %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc
+    %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc
+    %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc
+    %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc
+    %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc
+    %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc
+    %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc
+    %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc
+    %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc
+    %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc
+    %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc
+    %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc
+    %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc
+    %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc
+    %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc
+    %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc
+    %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc
+    %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc
+    %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc
+
+
+    ; Def scc
+    $scc = IMPLICIT_DEF
+
+    ; Branch
+    %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:  
+    liveins: $scc
+    successors: %bb.2
+    S_BRANCH %bb.2
+
+  bb.2:
+    liveins: $scc
+    ; Uses
+    KILL %t0
+    KILL %t2
+    KILL %t4
+    KILL %t6
+    KILL %t8
+    KILL %t10
+    KILL %t12
+    KILL %t14
+    KILL %t16
+    KILL %t18
+    KILL %t20
+    KILL %t22
+    KILL %t24
+    KILL %t26
+    KILL %t28
+    KILL %t30
+    KILL %t32
+    KILL %t34
+    KILL %t36
+    KILL %t38
+    KILL %t40
+    KILL %t42
+    KILL %t44
+    KILL %t46
+    KILL %t48
+    KILL %t50
+    KILL %t52
+    KILL %t54
+    KILL %t56
+    KILL %t58
+    KILL %t60
+    KILL %t62
+    KILL %t64
+    KILL %t66
+    KILL %t68
+    KILL %t70
+    KILL %t72
+    KILL %t74
+    KILL %t76
+    KILL %t78
+    KILL %t80
+    KILL %t82
+    KILL %t84
+    KILL %t86
+    KILL %t88
+    KILL %t90
+    KILL %t92
+    KILL %t94
+    KILL %t96
+    KILL %t98
+    KILL %t100
+    KILL %t102
+    KILL %t104
+    KILL %t106
+    KILL %t108
+    KILL %t110
+    KILL %t112
+    KILL %t114
+    KILL %t116
+    KILL %t118
+    KILL %t120
+    KILL %t122
+    KILL %t124
+    KILL %t126
+    KILL %t128
+    KILL %t130
+    KILL %t132
+    KILL %t134
+    KILL %t136
+    KILL %t138
+    KILL %t140
+    KILL %t142
+    KILL %t144
+    KILL %t146
+    KILL %t148
+    KILL %t150
+    KILL %t152
+    KILL %t154
+    KILL %t156
+    KILL %t158
+    KILL %t160
+    KILL %t162
+    KILL %t164
+    KILL %t166
+    KILL %t168
+    KILL %t170
+    KILL %t172
+    KILL %t174
+    KILL %t176
+    KILL %t178
+    KILL %t180
+    KILL %t182
+    KILL %t184
+    KILL %t186
+    KILL %t188
+    KILL %t190
+    KILL %t192
+    KILL %t194
+    KILL %t196
+    KILL %t198
+    KILL %t200
+    KILL %t202
+    KILL %t204
+    KILL %t206
+    KILL %t208
+    KILL %t210
+    KILL %t212
+    KILL %t214
+    KILL %t216
+    KILL %t218
+    KILL %t220
+    KILL %t222
+    KILL %t224
+    KILL %t226
+    KILL %t228
+    KILL %t230
+    KILL %t232
+    KILL %t234
+    KILL %t236
+    KILL %t238
+    KILL %t240
+    KILL %t242
+    KILL %t244
+    KILL %t246
+    KILL %t248
+    KILL %t250
+    KILL %t252
+    KILL %t254
+
+    KILL $scc
+
+    ; Some uses to inflate vgpr count
+    KILL %v0
+    KILL %v1
+    KILL %v2
+    KILL %v3
+    S_ENDPGM 0
+...
+    
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir
new file mode 100644
index 0000000000000..39d21dbda3819
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir
@@ -0,0 +1,564 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s
+
+# This test checks that scalar instructions that define $scc are not sunk into ranges where $scc is live
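+# The defs are still expected to leave bb.0 and land in bb.2, but as a single
+# group placed outside the $scc live range there (bb.2 defines $scc and later
+# kills it), rather than interleaved with their individual uses; the CHECK
+# lines below verify that grouping.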
+# CHECK: bb.0:
+# CHECK-NOT: S_NOT_B32
+# CHECK: bb.2:
+# CHECK: %t0:sgpr_32 = S_NOT_B32 0
+# CHECK: %t2:sgpr_32 = S_NOT_B32 1
+# CHECK: %t4:sgpr_32 = S_NOT_B32 2
+# CHECK: %t6:sgpr_32 = S_NOT_B32 3
+# CHECK: %t8:sgpr_32 = S_NOT_B32 4
+# CHECK: %t10:sgpr_32 = S_NOT_B32 5
+# CHECK: %t12:sgpr_32 = S_NOT_B32 6
+# CHECK: %t14:sgpr_32 = S_NOT_B32 7
+# CHECK: %t16:sgpr_32 = S_NOT_B32 8
+# CHECK: %t18:sgpr_32 = S_NOT_B32 9
+# CHECK: %t20:sgpr_32 = S_NOT_B32 10
+# CHECK: %t22:sgpr_32 = S_NOT_B32 11
+# CHECK: %t24:sgpr_32 = S_NOT_B32 12
+# CHECK: %t26:sgpr_32 = S_NOT_B32 13
+# CHECK: %t28:sgpr_32 = S_NOT_B32 14
+# CHECK: %t30:sgpr_32 = S_NOT_B32 15
+# CHECK: %t32:sgpr_32 = S_NOT_B32 16
+# CHECK: %t34:sgpr_32 = S_NOT_B32 17
+# CHECK: %t36:sgpr_32 = S_NOT_B32 18
+# CHECK: %t38:sgpr_32 = S_NOT_B32 19
+# CHECK: %t40:sgpr_32 = S_NOT_B32 20
+# CHECK: %t42:sgpr_32 = S_NOT_B32 21
+# CHECK: %t44:sgpr_32 = S_NOT_B32 22
+# CHECK: %t46:sgpr_32 = S_NOT_B32 23
+# CHECK: %t48:sgpr_32 = S_NOT_B32 24
+# CHECK: %t50:sgpr_32 = S_NOT_B32 25
+# CHECK: %t52:sgpr_32 = S_NOT_B32 26
+# CHECK: %t54:sgpr_32 = S_NOT_B32 27
+# CHECK: %t56:sgpr_32 = S_NOT_B32 28
+# CHECK: %t58:sgpr_32 = S_NOT_B32 29
+# CHECK: %t60:sgpr_32 = S_NOT_B32 30
+# CHECK: %t62:sgpr_32 = S_NOT_B32 31
+# CHECK: %t64:sgpr_32 = S_NOT_B32 32
+# CHECK: %t66:sgpr_32 = S_NOT_B32 33
+# CHECK: %t68:sgpr_32 = S_NOT_B32 34
+# CHECK: %t70:sgpr_32 = S_NOT_B32 35
+# CHECK: %t72:sgpr_32 = S_NOT_B32 36
+# CHECK: %t74:sgpr_32 = S_NOT_B32 37
+# CHECK: %t76:sgpr_32 = S_NOT_B32 38
+# CHECK: %t78:sgpr_32 = S_NOT_B32 39
+# CHECK: %t80:sgpr_32 = S_NOT_B32 40
+# CHECK: %t82:sgpr_32 = S_NOT_B32 41
+# CHECK: %t84:sgpr_32 = S_NOT_B32 42
+# CHECK: %t86:sgpr_32 = S_NOT_B32 43
+# CHECK: %t88:sgpr_32 = S_NOT_B32 44
+# CHECK: %t90:sgpr_32 = S_NOT_B32 45
+# CHECK: %t92:sgpr_32 = S_NOT_B32 46
+# CHECK: %t94:sgpr_32 = S_NOT_B32 47
+# CHECK: %t96:sgpr_32 = S_NOT_B32 48
+# CHECK: %t98:sgpr_32 = S_NOT_B32 49
+# CHECK: %t100:sgpr_32 = S_NOT_B32 50
+# CHECK: %t102:sgpr_32 = S_NOT_B32 51
+# CHECK: %t104:sgpr_32 = S_NOT_B32 52
+# CHECK: %t106:sgpr_32 = S_NOT_B32 53
+# CHECK: %t108:sgpr_32 = S_NOT_B32 54
+# CHECK: %t110:sgpr_32 = S_NOT_B32 55
+# CHECK: %t112:sgpr_32 = S_NOT_B32 56
+# CHECK: %t114:sgpr_32 = S_NOT_B32 57
+# CHECK: %t116:sgpr_32 = S_NOT_B32 58
+# CHECK: %t118:sgpr_32 = S_NOT_B32 59
+# CHECK: %t120:sgpr_32 = S_NOT_B32 60
+# CHECK: %t122:sgpr_32 = S_NOT_B32 61
+# CHECK: %t124:sgpr_32 = S_NOT_B32 62
+# CHECK: %t126:sgpr_32 = S_NOT_B32 63
+# CHECK: %t128:sgpr_32 = S_NOT_B32 64
+# CHECK: %t130:sgpr_32 = S_NOT_B32 65
+# CHECK: %t132:sgpr_32 = S_NOT_B32 66
+# CHECK: %t134:sgpr_32 = S_NOT_B32 67
+# CHECK: %t136:sgpr_32 = S_NOT_B32 68
+# CHECK: %t138:sgpr_32 = S_NOT_B32 69
+# CHECK: %t140:sgpr_32 = S_NOT_B32 70
+# CHECK: %t142:sgpr_32 = S_NOT_B32 71
+# CHECK: %t144:sgpr_32 = S_NOT_B32 72
+# CHECK: %t146:sgpr_32 = S_NOT_B32 73
+# CHECK: %t148:sgpr_32 = S_NOT_B32 74
+# CHECK: %t150:sgpr_32 = S_NOT_B32 75
+# CHECK: %t152:sgpr_32 = S_NOT_B32 76
+# CHECK: %t154:sgpr_32 = S_NOT_B32 77
+# CHECK: %t156:sgpr_32 = S_NOT_B32 78
+# CHECK: %t158:sgpr_32 = S_NOT_B32 79
+# CHECK: %t160:sgpr_32 = S_NOT_B32 80
+# CHECK: %t162:sgpr_32 = S_NOT_B32 81
+# CHECK: %t164:sgpr_32 = S_NOT_B32 82
+# CHECK: %t166:sgpr_32 = S_NOT_B32 83
+# CHECK: %t168:sgpr_32 = S_NOT_B32 84
+# CHECK: %t170:sgpr_32 = S_NOT_B32 85
+# CHECK: %t172:sgpr_32 = S_NOT_B32 86
+# CHECK: %t174:sgpr_32 = S_NOT_B32 87
+# CHECK: %t176:sgpr_32 = S_NOT_B32 88
+# CHECK: %t178:sgpr_32 = S_NOT_B32 89
+# CHECK: %t180:sgpr_32 = S_NOT_B32 90
+# CHECK: %t182:sgpr_32 = S_NOT_B32 91
+# CHECK: %t184:sgpr_32 = S_NOT_B32 92
+# CHECK: %t186:sgpr_32 = S_NOT_B32 93
+# CHECK: %t188:sgpr_32 = S_NOT_B32 94
+# CHECK: %t190:sgpr_32 = S_NOT_B32 95
+# CHECK: %t192:sgpr_32 = S_NOT_B32 96
+# CHECK: %t194:sgpr_32 = S_NOT_B32 97
+# CHECK: %t196:sgpr_32 = S_NOT_B32 98
+# CHECK: %t198:sgpr_32 = S_NOT_B32 99
+# CHECK: %t200:sgpr_32 = S_NOT_B32 100
+# CHECK: %t202:sgpr_32 = S_NOT_B32 101
+# CHECK: %t204:sgpr_32 = S_NOT_B32 102
+# CHECK: %t206:sgpr_32 = S_NOT_B32 103
+# CHECK: %t208:sgpr_32 = S_NOT_B32 104
+# CHECK: %t210:sgpr_32 = S_NOT_B32 105
+# CHECK: %t212:sgpr_32 = S_NOT_B32 106
+# CHECK: %t214:sgpr_32 = S_NOT_B32 107
+# CHECK: %t216:sgpr_32 = S_NOT_B32 108
+# CHECK: %t218:sgpr_32 = S_NOT_B32 109
+# CHECK: %t220:sgpr_32 = S_NOT_B32 110
+# CHECK: %t222:sgpr_32 = S_NOT_B32 111
+# CHECK: %t224:sgpr_32 = S_NOT_B32 112
+# CHECK: %t226:sgpr_32 = S_NOT_B32 113
+# CHECK: %t228:sgpr_32 = S_NOT_B32 114
+# CHECK: %t230:sgpr_32 = S_NOT_B32 115
+# CHECK: %t232:sgpr_32 = S_NOT_B32 116
+# CHECK: %t234:sgpr_32 = S_NOT_B32 117
+# CHECK: %t236:sgpr_32 = S_NOT_B32 118
+# CHECK: %t238:sgpr_32 = S_NOT_B32 119
+# CHECK: %t240:sgpr_32 = S_NOT_B32 120
+# CHECK: %t242:sgpr_32 = S_NOT_B32 121
+# CHECK: %t244:sgpr_32 = S_NOT_B32 122
+# CHECK: %t246:sgpr_32 = S_NOT_B32 123
+# CHECK: %t248:sgpr_32 = S_NOT_B32 124
+# CHECK: %t250:sgpr_32 = S_NOT_B32 125
+# CHECK: %t252:sgpr_32 = S_NOT_B32 126
+# CHECK: %t254:sgpr_32 = S_NOT_B32 127
+# CHECK: KILL %t0
+# CHECK: KILL %t2
+# CHECK: KILL %t4
+# CHECK: KILL %t6
+# CHECK: KILL %t8
+# CHECK: KILL %t10
+# CHECK: KILL %t12
+# CHECK: KILL %t14
+# CHECK: KILL %t16
+# CHECK: KILL %t18
+# CHECK: KILL %t20
+# CHECK: KILL %t22
+# CHECK: KILL %t24
+# CHECK: KILL %t26
+# CHECK: KILL %t28
+# CHECK: KILL %t30
+# CHECK: KILL %t32
+# CHECK: KILL %t34
+# CHECK: KILL %t36
+# CHECK: KILL %t38
+# CHECK: KILL %t40
+# CHECK: KILL %t42
+# CHECK: KILL %t44
+# CHECK: KILL %t46
+# CHECK: KILL %t48
+# CHECK: KILL %t50
+# CHECK: KILL %t52
+# CHECK: KILL %t54
+# CHECK: KILL %t56
+# CHECK: KILL %t58
+# CHECK: KILL %t60
+# CHECK: KILL %t62
+# CHECK: KILL %t64
+# CHECK: KILL %t66
+# CHECK: KILL %t68
+# CHECK: KILL %t70
+# CHECK: KILL %t72
+# CHECK: KILL %t74
+# CHECK: KILL %t76
+# CHECK: KILL %t78
+# CHECK: KILL %t80
+# CHECK: KILL %t82
+# CHECK: KILL %t84
+# CHECK: KILL %t86
+# CHECK: KILL %t88
+# CHECK: KILL %t90
+# CHECK: KILL %t92
+# CHECK: KILL %t94
+# CHECK: KILL %t96
+# CHECK: KILL %t98
+# CHECK: KILL %t100
+# CHECK: KILL %t102
+# CHECK: KILL %t104
+# CHECK: KILL %t106
+# CHECK: KILL %t108
+# CHECK: KILL %t110
+# CHECK: KILL %t112
+# CHECK: KILL %t114
+# CHECK: KILL %t116
+# CHECK: KILL %t118
+# CHECK: KILL %t120
+# CHECK: KILL %t122
+# CHECK: KILL %t124
+# CHECK: KILL %t126
+# CHECK: KILL %t128
+# CHECK: KILL %t130
+# CHECK: KILL %t132
+# CHECK: KILL %t134
+# CHECK: KILL %t136
+# CHECK: KILL %t138
+# CHECK: KILL %t140
+# CHECK: KILL %t142
+# CHECK: KILL %t144
+# CHECK: KILL %t146
+# CHECK: KILL %t148
+# CHECK: KILL %t150
+# CHECK: KILL %t152
+# CHECK: KILL %t154
+# CHECK: KILL %t156
+# CHECK: KILL %t158
+# CHECK: KILL %t160
+# CHECK: KILL %t162
+# CHECK: KILL %t164
+# CHECK: KILL %t166
+# CHECK: KILL %t168
+# CHECK: KILL %t170
+# CHECK: KILL %t172
+# CHECK: KILL %t174
+# CHECK: KILL %t176
+# CHECK: KILL %t178
+# CHECK: KILL %t180
+# CHECK: KILL %t182
+# CHECK: KILL %t184
+# CHECK: KILL %t186
+# CHECK: KILL %t188
+# CHECK: KILL %t190
+# CHECK: KILL %t192
+# CHECK: KILL %t194
+# CHECK: KILL %t196
+# CHECK: KILL %t198
+# CHECK: KILL %t200
+# CHECK: KILL %t202
+# CHECK: KILL %t204
+# CHECK: KILL %t206
+# CHECK: KILL %t208
+# CHECK: KILL %t210
+# CHECK: KILL %t212
+# CHECK: KILL %t214
+# CHECK: KILL %t216
+# CHECK: KILL %t218
+# CHECK: KILL %t220
+# CHECK: KILL %t222
+# CHECK: KILL %t224
+# CHECK: KILL %t226
+# CHECK: KILL %t228
+# CHECK: KILL %t230
+# CHECK: KILL %t232
+# CHECK: KILL %t234
+# CHECK: KILL %t236
+# CHECK: KILL %t238
+# CHECK: KILL %t240
+# CHECK: KILL %t242
+# CHECK: KILL %t244
+# CHECK: KILL %t246
+# CHECK: KILL %t248
+# CHECK: KILL %t250
+# CHECK: KILL %t252
+# CHECK: KILL %t254
+
+
+--- |
+  define amdgpu_ps void @main() {
+    ret void
+  }
+...
+---
+name:            main
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    ; To inflate vgpr count
+    %v0:vreg_1024 = IMPLICIT_DEF
+    %v1:vreg_1024 = IMPLICIT_DEF
+    %v2:vreg_1024 = IMPLICIT_DEF
+    %v3:vreg_1024 = IMPLICIT_DEF
+
+    ; Defs
+    %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc
+    %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc
+    %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc
+    %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc
+    %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc
+    %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc
+    %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc
+    %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc
+    %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc
+    %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc
+    %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc
+    %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc
+    %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc
+    %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc
+    %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc
+    %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc
+    %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc
+    %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc
+    %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc
+    %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc
+    %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc
+    %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc
+    %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc
+    %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc
+    %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc
+    %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc
+    %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc
+    %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc
+    %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc
+    %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc
+    %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc
+    %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc
+    %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc
+    %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc
+    %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc
+    %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc
+    %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc
+    %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc
+    %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc
+    %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc
+    %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc
+    %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc
+    %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc
+    %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc
+    %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc
+    %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc
+    %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc
+    %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc
+    %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc
+    %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc
+    %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc
+    %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc
+    %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc
+    %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc
+    %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc
+    %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc
+    %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc
+    %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc
+    %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc
+    %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc
+    %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc
+    %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc
+    %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc
+    %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc
+    %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc
+    %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc
+    %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc
+    %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc
+    %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc
+    %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc
+    %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc
+    %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc
+    %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc
+    %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc
+    %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc
+    %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc
+    %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc
+    %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc
+    %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc
+    %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc
+    %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc
+    %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc
+    %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc
+    %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc
+    %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc
+    %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc
+    %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc
+    %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc
+    %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc
+    %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc
+    %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc
+    %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc
+    %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc
+    %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc
+    %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc
+    %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc
+    %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc
+    %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc
+    %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc
+    %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc
+    %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc
+    %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc
+    %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc
+    %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc
+    %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc
+    %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc
+    %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc
+    %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc
+    %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc
+    %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc
+    %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc
+    %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc
+    %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc
+    %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc
+    %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc
+    %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc
+    %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc
+    %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc
+    %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc
+    %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc
+    %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc
+    %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc
+    %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc
+    %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc
+    %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc
+    %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc
+    %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc
+    %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc
+
+
+    ; Branch
+    %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:  
+    successors: %bb.2
+    S_BRANCH %bb.2
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    ; Uses
+    KILL %t0
+    KILL %t2
+    KILL %t4
+    KILL %t6
+    KILL %t8
+    KILL %t10
+    KILL %t12
+    KILL %t14
+    KILL %t16
+    KILL %t18
+    KILL %t20
+    KILL %t22
+    KILL %t24
+    KILL %t26
+    KILL %t28
+    KILL %t30
+    KILL %t32
+    KILL %t34
+    KILL %t36
+    KILL %t38
+    KILL %t40
+    KILL %t42
+    KILL %t44
+    KILL %t46
+    KILL %t48
+    KILL %t50
+    KILL %t52
+    KILL %t54
+    KILL %t56
+    KILL %t58
+    KILL %t60
+    KILL %t62
+    KILL %t64
+    KILL %t66
+    KILL %t68
+    KILL %t70
+    KILL %t72
+    KILL %t74
+    KILL %t76
+    KILL %t78
+    KILL %t80
+    KILL %t82
+    KILL %t84
+    KILL %t86
+    KILL %t88
+    KILL %t90
+    KILL %t92
+    KILL %t94
+    KILL %t96
+    KILL %t98
+    KILL %t100
+    KILL %t102
+    KILL %t104
+    KILL %t106
+    KILL %t108
+    KILL %t110
+    KILL %t112
+    KILL %t114
+    KILL %t116
+    KILL %t118
+    KILL %t120
+    KILL %t122
+    KILL %t124
+    KILL %t126
+    KILL %t128
+    KILL %t130
+    KILL %t132
+    KILL %t134
+    KILL %t136
+    KILL %t138
+    KILL %t140
+    KILL %t142
+    KILL %t144
+    KILL %t146
+    KILL %t148
+    KILL %t150
+    KILL %t152
+    KILL %t154
+    KILL %t156
+    KILL %t158
+    KILL %t160
+    KILL %t162
+    KILL %t164
+    KILL %t166
+    KILL %t168
+    KILL %t170
+    KILL %t172
+    KILL %t174
+    KILL %t176
+    KILL %t178
+    KILL %t180
+    KILL %t182
+    KILL %t184
+    KILL %t186
+    KILL %t188
+    KILL %t190
+    KILL %t192
+    KILL %t194
+    KILL %t196
+    KILL %t198
+    KILL %t200
+    KILL %t202
+    KILL %t204
+    KILL %t206
+    KILL %t208
+    KILL %t210
+    KILL %t212
+    KILL %t214
+    KILL %t216
+    KILL %t218
+    KILL %t220
+    KILL %t222
+    KILL %t224
+    KILL %t226
+    KILL %t228
+    KILL %t230
+    KILL %t232
+    KILL %t234
+    KILL %t236
+    KILL %t238
+    KILL %t240
+    KILL %t242
+    KILL %t244
+    KILL %t246
+    KILL %t248
+    KILL %t250
+    KILL %t252
+    KILL %t254
+
+    KILL $scc
+
+    ; Some uses to inflate vgpr count
+    KILL %v0
+    KILL %v1
+    KILL %v2
+    KILL %v3
+    S_ENDPGM 0
+...
+    
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir
new file mode 100644
index 0000000000000..305bf87a6120e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir
@@ -0,0 +1,304 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s
+
+# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are present
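+# bb.2 merges the bb.0 defs with the bb.1 IMPLICIT_DEF through PHIs, so the
+# pass's register pressure tracking has to step over PHI instructions without
+# asserting; the only expectation is that compilation finishes, hence the
+# single S_ENDPGM check.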
+# CHECK: S_ENDPGM
+
+
+--- |
+  define amdgpu_ps void @main() {
+    ret void
+  }
+...
+---
+name:            main
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    ; To inflate vgpr count
+    %v0:vreg_1024 = IMPLICIT_DEF
+    %v1:vreg_1024 = IMPLICIT_DEF
+    %v2:vreg_1024 = IMPLICIT_DEF
+    %v3:vreg_1024 = IMPLICIT_DEF
+
+    ; Defs
+    %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc
+    %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc
+    %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc
+    %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc
+    %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc
+    %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc
+    %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc
+    %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc
+    %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc
+    %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc
+    %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc
+    %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc
+    %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc
+    %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc
+    %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc
+    %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc
+    %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc
+    %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc
+    %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc
+    %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc
+    %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc
+    %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc
+    %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc
+    %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc
+    %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc
+    %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc
+    %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc
+    %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc
+    %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc
+    %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc
+    %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc
+    %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc
+    %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc
+    %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc
+    %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc
+    %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc
+    %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc
+    %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc
+    %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc
+    %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc
+    %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc
+    %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc
+    %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc
+    %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc
+    %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc
+    %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc
+    %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc
+    %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc
+    %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc
+    %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc
+    %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc
+    %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc
+    %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc
+    %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc
+    %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc
+    %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc
+    %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc
+    %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc
+    %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc
+    %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc
+    %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc
+    %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc
+    %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc
+    %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc
+    %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc
+    %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc
+    %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc
+    %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc
+    %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc
+    %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc
+    %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc
+    %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc
+    %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc
+    %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc
+    %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc
+    %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc
+    %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc
+    %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc
+    %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc
+    %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc
+    %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc
+    %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc
+    %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc
+    %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc
+    %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc
+    %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc
+    %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc
+    %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc
+    %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc
+    %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc
+    %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc
+    %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc
+    %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc
+    %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc
+    %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc
+    %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc
+    %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc
+    %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc
+    %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc
+    %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc
+    %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc
+    %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc
+    %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc
+    %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc
+    %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc
+    %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc
+    %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc
+    %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc
+    %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc
+    %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc
+    %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc
+    %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc
+    %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc
+    %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc
+    %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc
+    %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc
+    %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc
+    %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc
+    %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc
+    %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc
+    %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc
+    %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc
+    %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc
+    %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc
+    %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc
+    %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc
+    %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc
+    %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc
+
+
+    ; Branch
+    %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:  
+    successors: %bb.2
+    %s0:sgpr_32 = IMPLICIT_DEF
+    S_BRANCH %bb.2
+
+  bb.2:
+    %phi0:sgpr_32 = PHI %t0, %bb.0, %s0, %bb.1
+    %phi2:sgpr_32 = PHI %t2, %bb.0, %s0, %bb.1
+    %phi4:sgpr_32 = PHI %t4, %bb.0, %s0, %bb.1
+    %phi6:sgpr_32 = PHI %t6, %bb.0, %s0, %bb.1
+    %phi8:sgpr_32 = PHI %t8, %bb.0, %s0, %bb.1
+    %phi10:sgpr_32 = PHI %t10, %bb.0, %s0, %bb.1
+    %phi12:sgpr_32 = PHI %t12, %bb.0, %s0, %bb.1
+    %phi14:sgpr_32 = PHI %t14, %bb.0, %s0, %bb.1
+    %phi16:sgpr_32 = PHI %t16, %bb.0, %s0, %bb.1
+    %phi18:sgpr_32 = PHI %t18, %bb.0, %s0, %bb.1
+    %phi20:sgpr_32 = PHI %t20, %bb.0, %s0, %bb.1
+    %phi22:sgpr_32 = PHI %t22, %bb.0, %s0, %bb.1
+    %phi24:sgpr_32 = PHI %t24, %bb.0, %s0, %bb.1
+    %phi26:sgpr_32 = PHI %t26, %bb.0, %s0, %bb.1
+    %phi28:sgpr_32 = PHI %t28, %bb.0, %s0, %bb.1
+    %phi30:sgpr_32 = PHI %t30, %bb.0, %s0, %bb.1
+    %phi32:sgpr_32 = PHI %t32, %bb.0, %s0, %bb.1
+    %phi34:sgpr_32 = PHI %t34, %bb.0, %s0, %bb.1
+    %phi36:sgpr_32 = PHI %t36, %bb.0, %s0, %bb.1
+    %phi38:sgpr_32 = PHI %t38, %bb.0, %s0, %bb.1
+    %phi40:sgpr_32 = PHI %t40, %bb.0, %s0, %bb.1
+    %phi42:sgpr_32 = PHI %t42, %bb.0, %s0, %bb.1
+    %phi44:sgpr_32 = PHI %t44, %bb.0, %s0, %bb.1
+    %phi46:sgpr_32 = PHI %t46, %bb.0, %s0, %bb.1
+    %phi48:sgpr_32 = PHI %t48, %bb.0, %s0, %bb.1
+    %phi50:sgpr_32 = PHI %t50, %bb.0, %s0, %bb.1
+    %phi52:sgpr_32 = PHI %t52, %bb.0, %s0, %bb.1
+    %phi54:sgpr_32 = PHI %t54, %bb.0, %s0, %bb.1
+    %phi56:sgpr_32 = PHI %t56, %bb.0, %s0, %bb.1
+    %phi58:sgpr_32 = PHI %t58, %bb.0, %s0, %bb.1
+    %phi60:sgpr_32 = PHI %t60, %bb.0, %s0, %bb.1
+    %phi62:sgpr_32 = PHI %t62, %bb.0, %s0, %bb.1
+    %phi64:sgpr_32 = PHI %t64, %bb.0, %s0, %bb.1
+    %phi66:sgpr_32 = PHI %t66, %bb.0, %s0, %bb.1
+    %phi68:sgpr_32 = PHI %t68, %bb.0, %s0, %bb.1
+    %phi70:sgpr_32 = PHI %t70, %bb.0, %s0, %bb.1
+    %phi72:sgpr_32 = PHI %t72, %bb.0, %s0, %bb.1
+    %phi74:sgpr_32 = PHI %t74, %bb.0, %s0, %bb.1
+    %phi76:sgpr_32 = PHI %t76, %bb.0, %s0, %bb.1
+    %phi78:sgpr_32 = PHI %t78, %bb.0, %s0, %bb.1
+    %phi80:sgpr_32 = PHI %t80, %bb.0, %s0, %bb.1
+    %phi82:sgpr_32 = PHI %t82, %bb.0, %s0, %bb.1
+    %phi84:sgpr_32 = PHI %t84, %bb.0, %s0, %bb.1
+    %phi86:sgpr_32 = PHI %t86, %bb.0, %s0, %bb.1
+    %phi88:sgpr_32 = PHI %t88, %bb.0, %s0, %bb.1
+    %phi90:sgpr_32 = PHI %t90, %bb.0, %s0, %bb.1
+    %phi92:sgpr_32 = PHI %t92, %bb.0, %s0, %bb.1
+    %phi94:sgpr_32 = PHI %t94, %bb.0, %s0, %bb.1
+    %phi96:sgpr_32 = PHI %t96, %bb.0, %s0, %bb.1
+    %phi98:sgpr_32 = PHI %t98, %bb.0, %s0, %bb.1
+    %phi100:sgpr_32 = PHI %t100, %bb.0, %s0, %bb.1
+    %phi102:sgpr_32 = PHI %t102, %bb.0, %s0, %bb.1
+    %phi104:sgpr_32 = PHI %t104, %bb.0, %s0, %bb.1
+    %phi106:sgpr_32 = PHI %t106, %bb.0, %s0, %bb.1
+    %phi108:sgpr_32 = PHI %t108, %bb.0, %s0, %bb.1
+    %phi110:sgpr_32 = PHI %t110, %bb.0, %s0, %bb.1
+    %phi112:sgpr_32 = PHI %t112, %bb.0, %s0, %bb.1
+    %phi114:sgpr_32 = PHI %t114, %bb.0, %s0, %bb.1
+    %phi116:sgpr_32 = PHI %t116, %bb.0, %s0, %bb.1
+    %phi118:sgpr_32 = PHI %t118, %bb.0, %s0, %bb.1
+    %phi120:sgpr_32 = PHI %t120, %bb.0, %s0, %bb.1
+    %phi122:sgpr_32 = PHI %t122, %bb.0, %s0, %bb.1
+    %phi124:sgpr_32 = PHI %t124, %bb.0, %s0, %bb.1
+    %phi126:sgpr_32 = PHI %t126, %bb.0, %s0, %bb.1
+    %phi128:sgpr_32 = PHI %t128, %bb.0, %s0, %bb.1
+    %phi130:sgpr_32 = PHI %t130, %bb.0, %s0, %bb.1
+    %phi132:sgpr_32 = PHI %t132, %bb.0, %s0, %bb.1
+    %phi134:sgpr_32 = PHI %t134, %bb.0, %s0, %bb.1
+    %phi136:sgpr_32 = PHI %t136, %bb.0, %s0, %bb.1
+    %phi138:sgpr_32 = PHI %t138, %bb.0, %s0, %bb.1
+    %phi140:sgpr_32 = PHI %t140, %bb.0, %s0, %bb.1
+    %phi142:sgpr_32 = PHI %t142, %bb.0, %s0, %bb.1
+    %phi144:sgpr_32 = PHI %t144, %bb.0, %s0, %bb.1
+    %phi146:sgpr_32 = PHI %t146, %bb.0, %s0, %bb.1
+    %phi148:sgpr_32 = PHI %t148, %bb.0, %s0, %bb.1
+    %phi150:sgpr_32 = PHI %t150, %bb.0, %s0, %bb.1
+    %phi152:sgpr_32 = PHI %t152, %bb.0, %s0, %bb.1
+    %phi154:sgpr_32 = PHI %t154, %bb.0, %s0, %bb.1
+    %phi156:sgpr_32 = PHI %t156, %bb.0, %s0, %bb.1
+    %phi158:sgpr_32 = PHI %t158, %bb.0, %s0, %bb.1
+    %phi160:sgpr_32 = PHI %t160, %bb.0, %s0, %bb.1
+    %phi162:sgpr_32 = PHI %t162, %bb.0, %s0, %bb.1
+    %phi164:sgpr_32 = PHI %t164, %bb.0, %s0, %bb.1
+    %phi166:sgpr_32 = PHI %t166, %bb.0, %s0, %bb.1
+    %phi168:sgpr_32 = PHI %t168, %bb.0, %s0, %bb.1
+    %phi170:sgpr_32 = PHI %t170, %bb.0, %s0, %bb.1
+    %phi172:sgpr_32 = PHI %t172, %bb.0, %s0, %bb.1
+    %phi174:sgpr_32 = PHI %t174, %bb.0, %s0, %bb.1
+    %phi176:sgpr_32 = PHI %t176, %bb.0, %s0, %bb.1
+    %phi178:sgpr_32 = PHI %t178, %bb.0, %s0, %bb.1
+    %phi180:sgpr_32 = PHI %t180, %bb.0, %s0, %bb.1
+    %phi182:sgpr_32 = PHI %t182, %bb.0, %s0, %bb.1
+    %phi184:sgpr_32 = PHI %t184, %bb.0, %s0, %bb.1
+    %phi186:sgpr_32 = PHI %t186, %bb.0, %s0, %bb.1
+    %phi188:sgpr_32 = PHI %t188, %bb.0, %s0, %bb.1
+    %phi190:sgpr_32 = PHI %t190, %bb.0, %s0, %bb.1
+    %phi192:sgpr_32 = PHI %t192, %bb.0, %s0, %bb.1
+    %phi194:sgpr_32 = PHI %t194, %bb.0, %s0, %bb.1
+    %phi196:sgpr_32 = PHI %t196, %bb.0, %s0, %bb.1
+    %phi198:sgpr_32 = PHI %t198, %bb.0, %s0, %bb.1
+    %phi200:sgpr_32 = PHI %t200, %bb.0, %s0, %bb.1
+    %phi202:sgpr_32 = PHI %t202, %bb.0, %s0, %bb.1
+    %phi204:sgpr_32 = PHI %t204, %bb.0, %s0, %bb.1
+    %phi206:sgpr_32 = PHI %t206, %bb.0, %s0, %bb.1
+    %phi208:sgpr_32 = PHI %t208, %bb.0, %s0, %bb.1
+    %phi210:sgpr_32 = PHI %t210, %bb.0, %s0, %bb.1
+    %phi212:sgpr_32 = PHI %t212, %bb.0, %s0, %bb.1
+    %phi214:sgpr_32 = PHI %t214, %bb.0, %s0, %bb.1
+    %phi216:sgpr_32 = PHI %t216, %bb.0, %s0, %bb.1
+    %phi218:sgpr_32 = PHI %t218, %bb.0, %s0, %bb.1
+    %phi220:sgpr_32 = PHI %t220, %bb.0, %s0, %bb.1
+    %phi222:sgpr_32 = PHI %t222, %bb.0, %s0, %bb.1
+    %phi224:sgpr_32 = PHI %t224, %bb.0, %s0, %bb.1
+    %phi226:sgpr_32 = PHI %t226, %bb.0, %s0, %bb.1
+    %phi228:sgpr_32 = PHI %t228, %bb.0, %s0, %bb.1
+    %phi230:sgpr_32 = PHI %t230, %bb.0, %s0, %bb.1
+    %phi232:sgpr_32 = PHI %t232, %bb.0, %s0, %bb.1
+    %phi234:sgpr_32 = PHI %t234, %bb.0, %s0, %bb.1
+    %phi236:sgpr_32 = PHI %t236, %bb.0, %s0, %bb.1
+    %phi238:sgpr_32 = PHI %t238, %bb.0, %s0, %bb.1
+    %phi240:sgpr_32 = PHI %t240, %bb.0, %s0, %bb.1
+    %phi242:sgpr_32 = PHI %t242, %bb.0, %s0, %bb.1
+    %phi244:sgpr_32 = PHI %t244, %bb.0, %s0, %bb.1
+    %phi246:sgpr_32 = PHI %t246, %bb.0, %s0, %bb.1
+    %phi248:sgpr_32 = PHI %t248, %bb.0, %s0, %bb.1
+    %phi250:sgpr_32 = PHI %t250, %bb.0, %s0, %bb.1
+    %phi252:sgpr_32 = PHI %t252, %bb.0, %s0, %bb.1
+    %phi254:sgpr_32 = PHI %t254, %bb.0, %s0, %bb.1
+
+
+    ; Some uses to inflate vgpr count
+    KILL %v0
+    KILL %v1
+    KILL %v2
+    KILL %v3
+    S_ENDPGM 0
+...
+    
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir
new file mode 100644
index 0000000000000..94e86a61c09d6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir
@@ -0,0 +1,564 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s
+
+# This test checks that instructions that define $scc are sunk to their users
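+# Unlike simple_sgpr_no_scc.mir, $scc is neither live into bb.2 nor redefined
+# there, so each rematerialized def can land directly in front of its KILL;
+# the CHECK lines below verify the interleaved def/use pairs in bb.2.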
+# CHECK: bb.0:
+# CHECK-NOT: S_NOT_B32
+# CHECK: bb.2:
+# CHECK: %t0:sgpr_32 = S_NOT_B32 0
+# CHECK: KILL %t0
+# CHECK: %t2:sgpr_32 = S_NOT_B32 1
+# CHECK: KILL %t2
+# CHECK: %t4:sgpr_32 = S_NOT_B32 2
+# CHECK: KILL %t4
+# CHECK: %t6:sgpr_32 = S_NOT_B32 3
+# CHECK: KILL %t6
+# CHECK: %t8:sgpr_32 = S_NOT_B32 4
+# CHECK: KILL %t8
+# CHECK: %t10:sgpr_32 = S_NOT_B32 5
+# CHECK: KILL %t10
+# CHECK: %t12:sgpr_32 = S_NOT_B32 6
+# CHECK: KILL %t12
+# CHECK: %t14:sgpr_32 = S_NOT_B32 7
+# CHECK: KILL %t14
+# CHECK: %t16:sgpr_32 = S_NOT_B32 8
+# CHECK: KILL %t16
+# CHECK: %t18:sgpr_32 = S_NOT_B32 9
+# CHECK: KILL %t18
+# CHECK: %t20:sgpr_32 = S_NOT_B32 10
+# CHECK: KILL %t20
+# CHECK: %t22:sgpr_32 = S_NOT_B32 11
+# CHECK: KILL %t22
+# CHECK: %t24:sgpr_32 = S_NOT_B32 12
+# CHECK: KILL %t24
+# CHECK: %t26:sgpr_32 = S_NOT_B32 13
+# CHECK: KILL %t26
+# CHECK: %t28:sgpr_32 = S_NOT_B32 14
+# CHECK: KILL %t28
+# CHECK: %t30:sgpr_32 = S_NOT_B32 15
+# CHECK: KILL %t30
+# CHECK: %t32:sgpr_32 = S_NOT_B32 16
+# CHECK: KILL %t32
+# CHECK: %t34:sgpr_32 = S_NOT_B32 17
+# CHECK: KILL %t34
+# CHECK: %t36:sgpr_32 = S_NOT_B32 18
+# CHECK: KILL %t36
+# CHECK: %t38:sgpr_32 = S_NOT_B32 19
+# CHECK: KILL %t38
+# CHECK: %t40:sgpr_32 = S_NOT_B32 20
+# CHECK: KILL %t40
+# CHECK: %t42:sgpr_32 = S_NOT_B32 21
+# CHECK: KILL %t42
+# CHECK: %t44:sgpr_32 = S_NOT_B32 22
+# CHECK: KILL %t44
+# CHECK: %t46:sgpr_32 = S_NOT_B32 23
+# CHECK: KILL %t46
+# CHECK: %t48:sgpr_32 = S_NOT_B32 24
+# CHECK: KILL %t48
+# CHECK: %t50:sgpr_32 = S_NOT_B32 25
+# CHECK: KILL %t50
+# CHECK: %t52:sgpr_32 = S_NOT_B32 26
+# CHECK: KILL %t52
+# CHECK: %t54:sgpr_32 = S_NOT_B32 27
+# CHECK: KILL %t54
+# CHECK: %t56:sgpr_32 = S_NOT_B32 28
+# CHECK: KILL %t56
+# CHECK: %t58:sgpr_32 = S_NOT_B32 29
+# CHECK: KILL %t58
+# CHECK: %t60:sgpr_32 = S_NOT_B32 30
+# CHECK: KILL %t60
+# CHECK: %t62:sgpr_32 = S_NOT_B32 31
+# CHECK: KILL %t62
+# CHECK: %t64:sgpr_32 = S_NOT_B32 32
+# CHECK: KILL %t64
+# CHECK: %t66:sgpr_32 = S_NOT_B32 33
+# CHECK: KILL %t66
+# CHECK: %t68:sgpr_32 = S_NOT_B32 34
+# CHECK: KILL %t68
+# CHECK: %t70:sgpr_32 = S_NOT_B32 35
+# CHECK: KILL %t70
+# CHECK: %t72:sgpr_32 = S_NOT_B32 36
+# CHECK: KILL %t72
+# CHECK: %t74:sgpr_32 = S_NOT_B32 37
+# CHECK: KILL %t74
+# CHECK: %t76:sgpr_32 = S_NOT_B32 38
+# CHECK: KILL %t76
+# CHECK: %t78:sgpr_32 = S_NOT_B32 39
+# CHECK: KILL %t78
+# CHECK: %t80:sgpr_32 = S_NOT_B32 40
+# CHECK: KILL %t80
+# CHECK: %t82:sgpr_32 = S_NOT_B32 41
+# CHECK: KILL %t82
+# CHECK: %t84:sgpr_32 = S_NOT_B32 42
+# CHECK: KILL %t84
+# CHECK: %t86:sgpr_32 = S_NOT_B32 43
+# CHECK: KILL %t86
+# CHECK: %t88:sgpr_32 = S_NOT_B32 44
+# CHECK: KILL %t88
+# CHECK: %t90:sgpr_32 = S_NOT_B32 45
+# CHECK: KILL %t90
+# CHECK: %t92:sgpr_32 = S_NOT_B32 46
+# CHECK: KILL %t92
+# CHECK: %t94:sgpr_32 = S_NOT_B32 47
+# CHECK: KILL %t94
+# CHECK: %t96:sgpr_32 = S_NOT_B32 48
+# CHECK: KILL %t96
+# CHECK: %t98:sgpr_32 = S_NOT_B32 49
+# CHECK: KILL %t98
+# CHECK: %t100:sgpr_32 = S_NOT_B32 50
+# CHECK: KILL %t100
+# CHECK: %t102:sgpr_32 = S_NOT_B32 51
+# CHECK: KILL %t102
+# CHECK: %t104:sgpr_32 = S_NOT_B32 52
+# CHECK: KILL %t104
+# CHECK: %t106:sgpr_32 = S_NOT_B32 53
+# CHECK: KILL %t106
+# CHECK: %t108:sgpr_32 = S_NOT_B32 54
+# CHECK: KILL %t108
+# CHECK: %t110:sgpr_32 = S_NOT_B32 55
+# CHECK: KILL %t110
+# CHECK: %t112:sgpr_32 = S_NOT_B32 56
+# CHECK: KILL %t112
+# CHECK: %t114:sgpr_32 = S_NOT_B32 57
+# CHECK: KILL %t114
+# CHECK: %t116:sgpr_32 = S_NOT_B32 58
+# CHECK: KILL %t116
+# CHECK: %t118:sgpr_32 = S_NOT_B32 59
+# CHECK: KILL %t118
+# CHECK: %t120:sgpr_32 = S_NOT_B32 60
+# CHECK: KILL %t120
+# CHECK: %t122:sgpr_32 = S_NOT_B32 61
+# CHECK: KILL %t122
+# CHECK: %t124:sgpr_32 = S_NOT_B32 62
+# CHECK: KILL %t124
+# CHECK: %t126:sgpr_32 = S_NOT_B32 63
+# CHECK: KILL %t126
+# CHECK: %t128:sgpr_32 = S_NOT_B32 64
+# CHECK: KILL %t128
+# CHECK: %t130:sgpr_32 = S_NOT_B32 65
+# CHECK: KILL %t130
+# CHECK: %t132:sgpr_32 = S_NOT_B32 66
+# CHECK: KILL %t132
+# CHECK: %t134:sgpr_32 = S_NOT_B32 67
+# CHECK: KILL %t134
+# CHECK: %t136:sgpr_32 = S_NOT_B32 68
+# CHECK: KILL %t136
+# CHECK: %t138:sgpr_32 = S_NOT_B32 69
+# CHECK: KILL %t138
+# CHECK: %t140:sgpr_32 = S_NOT_B32 70
+# CHECK: KILL %t140
+# CHECK: %t142:sgpr_32 = S_NOT_B32 71
+# CHECK: KILL %t142
+# CHECK: %t144:sgpr_32 = S_NOT_B32 72
+# CHECK: KILL %t144
+# CHECK: %t146:sgpr_32 = S_NOT_B32 73
+# CHECK: KILL %t146
+# CHECK: %t148:sgpr_32 = S_NOT_B32 74
+# CHECK: KILL %t148
+# CHECK: %t150:sgpr_32 = S_NOT_B32 75
+# CHECK: KILL %t150
+# CHECK: %t152:sgpr_32 = S_NOT_B32 76
+# CHECK: KILL %t152
+# CHECK: %t154:sgpr_32 = S_NOT_B32 77
+# CHECK: KILL %t154
+# CHECK: %t156:sgpr_32 = S_NOT_B32 78
+# CHECK: KILL %t156
+# CHECK: %t158:sgpr_32 = S_NOT_B32 79
+# CHECK: KILL %t158
+# CHECK: %t160:sgpr_32 = S_NOT_B32 80
+# CHECK: KILL %t160
+# CHECK: %t162:sgpr_32 = S_NOT_B32 81
+# CHECK: KILL %t162
+# CHECK: %t164:sgpr_32 = S_NOT_B32 82
+# CHECK: KILL %t164
+# CHECK: %t166:sgpr_32 = S_NOT_B32 83
+# CHECK: KILL %t166
+# CHECK: %t168:sgpr_32 = S_NOT_B32 84
+# CHECK: KILL %t168
+# CHECK: %t170:sgpr_32 = S_NOT_B32 85
+# CHECK: KILL %t170
+# CHECK: %t172:sgpr_32 = S_NOT_B32 86
+# CHECK: KILL %t172
+# CHECK: %t174:sgpr_32 = S_NOT_B32 87
+# CHECK: KILL %t174
+# CHECK: %t176:sgpr_32 = S_NOT_B32 88
+# CHECK: KILL %t176
+# CHECK: %t178:sgpr_32 = S_NOT_B32 89
+# CHECK: KILL %t178
+# CHECK: %t180:sgpr_32 = S_NOT_B32 90
+# CHECK: KILL %t180
+# CHECK: %t182:sgpr_32 = S_NOT_B32 91
+# CHECK: KILL %t182
+# CHECK: %t184:sgpr_32 = S_NOT_B32 92
+# CHECK: KILL %t184
+# CHECK: %t186:sgpr_32 = S_NOT_B32 93
+# CHECK: KILL %t186
+# CHECK: %t188:sgpr_32 = S_NOT_B32 94
+# CHECK: KILL %t188
+# CHECK: %t190:sgpr_32 = S_NOT_B32 95
+# CHECK: KILL %t190
+# CHECK: %t192:sgpr_32 = S_NOT_B32 96
+# CHECK: KILL %t192
+# CHECK: %t194:sgpr_32 = S_NOT_B32 97
+# CHECK: KILL %t194
+# CHECK: %t196:sgpr_32 = S_NOT_B32 98
+# CHECK: KILL %t196
+# CHECK: %t198:sgpr_32 = S_NOT_B32 99
+# CHECK: KILL %t198
+# CHECK: %t200:sgpr_32 = S_NOT_B32 100
+# CHECK: KILL %t200
+# CHECK: %t202:sgpr_32 = S_NOT_B32 101
+# CHECK: KILL %t202
+# CHECK: %t204:sgpr_32 = S_NOT_B32 102
+# CHECK: KILL %t204
+# CHECK: %t206:sgpr_32 = S_NOT_B32 103
+# CHECK: KILL %t206
+# CHECK: %t208:sgpr_32 = S_NOT_B32 104
+# CHECK: KILL %t208
+# CHECK: %t210:sgpr_32 = S_NOT_B32 105
+# CHECK: KILL %t210
+# CHECK: %t212:sgpr_32 = S_NOT_B32 106
+# CHECK: KILL %t212
+# CHECK: %t214:sgpr_32 = S_NOT_B32 107
+# CHECK: KILL %t214
+# CHECK: %t216:sgpr_32 = S_NOT_B32 108
+# CHECK: KILL %t216
+# CHECK: %t218:sgpr_32 = S_NOT_B32 109
+# CHECK: KILL %t218
+# CHECK: %t220:sgpr_32 = S_NOT_B32 110
+# CHECK: KILL %t220
+# CHECK: %t222:sgpr_32 = S_NOT_B32 111
+# CHECK: KILL %t222
+# CHECK: %t224:sgpr_32 = S_NOT_B32 112
+# CHECK: KILL %t224
+# CHECK: %t226:sgpr_32 = S_NOT_B32 113
+# CHECK: KILL %t226
+# CHECK: %t228:sgpr_32 = S_NOT_B32 114
+# CHECK: KILL %t228
+# CHECK: %t230:sgpr_32 = S_NOT_B32 115
+# CHECK: KILL %t230
+# CHECK: %t232:sgpr_32 = S_NOT_B32 116
+# CHECK: KILL %t232
+# CHECK: %t234:sgpr_32 = S_NOT_B32 117
+# CHECK: KILL %t234
+# CHECK: %t236:sgpr_32 = S_NOT_B32 118
+# CHECK: KILL %t236
+# CHECK: %t238:sgpr_32 = S_NOT_B32 119
+# CHECK: KILL %t238
+# CHECK: %t240:sgpr_32 = S_NOT_B32 120
+# CHECK: KILL %t240
+# CHECK: %t242:sgpr_32 = S_NOT_B32 121
+# CHECK: KILL %t242
+# CHECK: %t244:sgpr_32 = S_NOT_B32 122
+# CHECK: KILL %t244
+# CHECK: %t246:sgpr_32 = S_NOT_B32 123
+# CHECK: KILL %t246
+# CHECK: %t248:sgpr_32 = S_NOT_B32 124
+# CHECK: KILL %t248
+# CHECK: %t250:sgpr_32 = S_NOT_B32 125
+# CHECK: KILL %t250
+# CHECK: %t252:sgpr_32 = S_NOT_B32 126
+# CHECK: KILL %t252
+# CHECK: %t254:sgpr_32 = S_NOT_B32 127
+# CHECK: KILL %t254
+
+
+--- |
+  define amdgpu_ps void @main() {
+    ret void
+  }
+...
+---
+name:            main
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    ; To inflate vgpr count
+    %v0:vreg_1024 = IMPLICIT_DEF
+    %v1:vreg_1024 = IMPLICIT_DEF
+    %v2:vreg_1024 = IMPLICIT_DEF
+    %v3:vreg_1024 = IMPLICIT_DEF
+
+    ; Defs
+    %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc
+    %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc
+    %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc
+    %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc
+    %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc
+    %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc
+    %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc
+    %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc
+    %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc
+    %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc
+    %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc
+    %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc
+    %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc
+    %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc
+    %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc
+    %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc
+    %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc
+    %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc
+    %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc
+    %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc
+    %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc
+    %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc
+    %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc
+    %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc
+    %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc
+    %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc
+    %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc
+    %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc
+    %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc
+    %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc
+    %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc
+    %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc
+    %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc
+    %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc
+    %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc
+    %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc
+    %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc
+    %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc
+    %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc
+    %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc
+    %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc
+    %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc
+    %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc
+    %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc
+    %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc
+    %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc
+    %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc
+    %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc
+    %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc
+    %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc
+    %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc
+    %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc
+    %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc
+    %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc
+    %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc
+    %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc
+    %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc
+    %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc
+    %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc
+    %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc
+    %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc
+    %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc
+    %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc
+    %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc
+    %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc
+    %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc
+    %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc
+    %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc
+    %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc
+    %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc
+    %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc
+    %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc
+    %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc
+    %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc
+    %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc
+    %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc
+    %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc
+    %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc
+    %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc
+    %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc
+    %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc
+    %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc
+    %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc
+    %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc
+    %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc
+    %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc
+    %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc
+    %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc
+    %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc
+    %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc
+    %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc
+    %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc
+    %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc
+    %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc
+    %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc
+    %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc
+    %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc
+    %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc
+    %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc
+    %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc
+    %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc
+    %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc
+    %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc
+    %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc
+    %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc
+    %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc
+    %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc
+    %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc
+    %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc
+    %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc
+    %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc
+    %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc
+    %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc
+    %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc
+    %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc
+    %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc
+    %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc
+    %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc
+    %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc
+    %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc
+    %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc
+    %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc
+    %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc
+    %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc
+    %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc
+    %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc
+    %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc
+    %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc
+
+
+    ; Branch
+    %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:  
+    successors: %bb.2
+    S_BRANCH %bb.2
+
+  bb.2:
+
+    ; Uses
+    KILL %t0
+    KILL %t2
+    KILL %t4
+    KILL %t6
+    KILL %t8
+    KILL %t10
+    KILL %t12
+    KILL %t14
+    KILL %t16
+    KILL %t18
+    KILL %t20
+    KILL %t22
+    KILL %t24
+    KILL %t26
+    KILL %t28
+    KILL %t30
+    KILL %t32
+    KILL %t34
+    KILL %t36
+    KILL %t38
+    KILL %t40
+    KILL %t42
+    KILL %t44
+    KILL %t46
+    KILL %t48
+    KILL %t50
+    KILL %t52
+    KILL %t54
+    KILL %t56
+    KILL %t58
+    KILL %t60
+    KILL %t62
+    KILL %t64
+    KILL %t66
+    KILL %t68
+    KILL %t70
+    KILL %t72
+    KILL %t74
+    KILL %t76
+    KILL %t78
+    KILL %t80
+    KILL %t82
+    KILL %t84
+    KILL %t86
+    KILL %t88
+    KILL %t90
+    KILL %t92
+    KILL %t94
+    KILL %t96
+    KILL %t98
+    KILL %t100
+    KILL %t102
+    KILL %t104
+    KILL %t106
+    KILL %t108
+    KILL %t110
+    KILL %t112
+    KILL %t114
+    KILL %t116
+    KILL %t118
+    KILL %t120
+    KILL %t122
+    KILL %t124
+    KILL %t126
+    KILL %t128
+    KILL %t130
+    KILL %t132
+    KILL %t134
+    KILL %t136
+    KILL %t138
+    KILL %t140
+    KILL %t142
+    KILL %t144
+    KILL %t146
+    KILL %t148
+    KILL %t150
+    KILL %t152
+    KILL %t154
+    KILL %t156
+    KILL %t158
+    KILL %t160
+    KILL %t162
+    KILL %t164
+    KILL %t166
+    KILL %t168
+    KILL %t170
+    KILL %t172
+    KILL %t174
+    KILL %t176
+    KILL %t178
+    KILL %t180
+    KILL %t182
+    KILL %t184
+    KILL %t186
+    KILL %t188
+    KILL %t190
+    KILL %t192
+    KILL %t194
+    KILL %t196
+    KILL %t198
+    KILL %t200
+    KILL %t202
+    KILL %t204
+    KILL %t206
+    KILL %t208
+    KILL %t210
+    KILL %t212
+    KILL %t214
+    KILL %t216
+    KILL %t218
+    KILL %t220
+    KILL %t222
+    KILL %t224
+    KILL %t226
+    KILL %t228
+    KILL %t230
+    KILL %t232
+    KILL %t234
+    KILL %t236
+    KILL %t238
+    KILL %t240
+    KILL %t242
+    KILL %t244
+    KILL %t246
+    KILL %t248
+    KILL %t250
+    KILL %t252
+    KILL %t254
+
+
+
+    ; Some uses to inflate vgpr count
+    KILL %v0
+    KILL %v1
+    KILL %v2
+    KILL %v3
+    S_ENDPGM 0
+...
+    
\ No newline at end of file

>From d755d527434295a157824fe51b0da601778cc14f Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Tue, 6 May 2025 20:18:42 -0700
Subject: [PATCH 11/11] call empty instead of size==0

---
 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 3a0fa5cad4c13..b00d286c938f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -817,7 +817,7 @@ findInsertBlock(MachineInstr &DefMI, Register Reg, MachineDominatorTree *DT,
   for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
     BBSet.insert(UseMI.getParent());
   }
-  if (BBSet.size() == 0)
+  if (BBSet.empty())
     return nullptr;
 
   MachineBasicBlock *BB = *BBSet.begin();


