[llvm] [AMDGPU] Added hot-block-rematerialize pass (PR #126331)

Adam Yang via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 7 17:27:49 PST 2025


https://github.com/adam-yang created https://github.com/llvm/llvm-project/pull/126331

None

>From 095dd0ab0c125215781256ac97e1ea790807e222 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Mon, 3 Feb 2025 14:48:43 -0800
Subject: [PATCH 1/3] Added rematerialize pass and test.

---
 .../include/llvm/CodeGen/TargetRegisterInfo.h |    8 +
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       |   91 +
 llvm/lib/Target/AMDGPU/AMDGPU.h               |    4 +
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    | 4665 +++++++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp     | 2241 ++++++++
 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h       |  217 +
 .../AMDGPU/AMDGPUMirDivergenceAnalysis.cpp    | 2767 ++++++++++
 .../AMDGPU/AMDGPUMirDivergenceAnalysis.h      |  281 +
 .../AMDGPUMirSyncDependenceAnalysis.cpp       |  511 ++
 .../AMDGPU/AMDGPUMirSyncDependenceAnalysis.h  |   98 +
 .../AMDGPUOccupancyAndLatencyHelper.cpp       |  188 +
 .../AMDGPU/AMDGPUOccupancyAndLatencyHelper.h  |   74 +
 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp    | 1790 +++++++
 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h      |  197 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |    1 +
 llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h  |  106 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |    6 +
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |    4 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |    3 +
 .../CodeGen/AMDGPU/remat/vector_to_scalar.mir |  405 ++
 20 files changed, 13657 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 114149ff53d850b..4a4d7756ae9ac71 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -430,6 +430,14 @@ class TargetRegisterInfo : public MCRegisterInfo {
                                 LaneBitmask LaneMask,
                                 SmallVectorImpl<unsigned> &Indexes) const;
 
+  /// Return the set of sub register indexes that minimally cover the given
+  /// lane mask for the given register class.
+  ///
+  /// \returns an empty set if there is no set of covering sub registers.
+  std::vector<unsigned>
+  getMinimalSpanningSubRegIdxSetForLaneMask(const TargetRegisterClass *RC,
+                                            LaneBitmask mask) const;
+
   /// The lane masks returned by getSubRegIndexLaneMask() above can only be
   /// used to determine if sub-registers overlap - they can't be used to
   /// determine if a set of sub-registers completely cover another
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 77a4c74f1b38b9d..d37796a82899a1a 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -719,3 +719,94 @@ void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex,
   dbgs() << printReg(Reg, TRI, SubRegIndex) << "\n";
 }
 #endif
+
+std::vector<unsigned>
+TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask(
+    const TargetRegisterClass *RC, LaneBitmask mask) const {
+  // TODO: this could replace the code it was copied from in SplitKit.cpp
+
+  // First pass: Try to find a perfectly matching subregister index.
+  // If none exists find the one covering the most lanemask bits.
+  SmallVector<unsigned, 8> PossibleIndexes;
+  unsigned BestIdx = 0;
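+  // Lanes outside the requested mask; a candidate index must not touch these.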
+  const LaneBitmask avoid = ~mask;
+  {
+    unsigned BestCover = 0;
+    for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) {
+      // Is this index even compatible with the given class?
+      if (getSubClassWithSubReg(RC, Idx) != RC)
+        continue;
+      LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx);
+      // Early exit if we found a perfect match.
+      if (SubRegMask == mask) {
+        BestIdx = Idx;
+        break;
+      }
+
+      // The index must not cover any lanes outside the requested mask.
+      if ((SubRegMask & avoid).any())
+        continue;
+
+      unsigned PopCount = SubRegMask.getNumLanes();
+      PossibleIndexes.push_back(Idx);
+      if (PopCount > BestCover) {
+        BestCover = PopCount;
+        BestIdx = Idx;
+      }
+    }
+  }
+
+  // Abort if we cannot possibly implement the COPY with the given indexes.
+  if (BestIdx == 0) {
+    LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
+                      << getRegClassName(RC) << " mask " << PrintLaneMask(mask)
+                      << '\n');
+    assert(false && "Impossible to span reg class");
+    return std::vector<unsigned>();
+  }
+
+  std::vector<unsigned> result;
+  result.push_back(BestIdx);
+
+  // Greedy heuristic: keep iterating, picking the best covering subreg index
+  // each time.
+  mask &= ~(getSubRegIndexLaneMask(BestIdx));
+  while (mask.any()) {
+    BestIdx = 0;
+    int BestCover = std::numeric_limits<int>::min();
+    for (unsigned Idx : PossibleIndexes) {
+      LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx);
+      // Early exit if we found a perfect match.
+      if (SubRegMask == mask) {
+        BestIdx = Idx;
+        break;
+      }
+
+      // Guaranteed above
+      assert((SubRegMask & avoid).none());
+
+      // Try to cover as much of the remaining lanes as possible but as few of
+      // the already covered lanes as possible.
+      int Cover = (SubRegMask & mask).getNumLanes() -
+                  (SubRegMask & ~mask).getNumLanes();
+      if (Cover > BestCover) {
+        BestCover = Cover;
+        BestIdx = Idx;
+      }
+    }
+
+    if (BestIdx == 0) {
+      LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
+                        << getRegClassName(RC) << " mask " << PrintLaneMask(mask)
+                        << '\n');
+      assert(false && "Impossible to span reg class");
+      return std::vector<unsigned>();
+    }
+
+    result.push_back(BestIdx);
+    mask &= ~getSubRegIndexLaneMask(BestIdx);
+  }
+
+  return result;
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 31656c98ccd36fa..0f5b4f2277d1a8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -466,6 +466,10 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &);
+FunctionPass *createAMDGPUHotBlockRematerializePass();
+extern char &AMDGPUHotBlockRematerializeID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
new file mode 100644
index 000000000000000..44ebaa2d51bec19
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -0,0 +1,4665 @@
+//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Rematerialize instructions in hot blocks to reduce register pressure and
+/// avoid spills.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUMirDivergenceAnalysis.h"
+#include "AMDGPUSubExpDag.h"
+#include "AMDGPUVMemDegreeDAG.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "GCNRegPressure.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "AMDGPUMIRUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+#include <unordered_set>
+#define DEBUG_TYPE "amdgpu-hot-block-remat"
+
+using namespace llvm;
+
+static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
+static cl::opt<bool> EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+static cl::opt<bool> EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive");
+static cl::opt<bool> EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone");
+static cl::opt<bool> EnableVmemDegree("amdgpu-remat-enable-vmem-degree");
+static cl::opt<bool> EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat");
+static cl::opt<bool> EnableSubExp("amdgpu-remat-enable-sub-exp-remat");
+static cl::opt<bool> EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos");
+static cl::opt<bool> EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg");
+
+namespace {
+typedef DenseSet<MachineInstr *> InstSet;
+typedef DenseSet<MachineBasicBlock *> BlockSet;
+template<typename T>
+using BlockMap = MapVector<MachineBasicBlock *, T>;
+
+// Rematerialize in a standalone pass instead of during register allocation.
+// In register allocation, failing to rematerialize causes a spill.
+class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
+
+public:
+  static char ID;
+
+  DenseSet<const MachineInstr*> TotalUniformInsts;
+  DenseSet<const MachineInstr*> SafeToRemoveInsts;
+  DenseSet<const MachineInstr*> DivergentInsts;
+  void RemoveInst(const MachineInstr *MI) {
+    TotalUniformInsts.erase(MI);
+    SafeToRemoveInsts.erase(MI);
+    DivergentInsts.erase(MI);
+  }
+
+  AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "AMDGPU rematerialize"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfoWrapperPass>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+    AU.addRequired<SlotIndexesWrapperPass>();
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+typedef AMDGPUHotBlockRematerialize Remat;
+
+} // end anonymous namespace
+
+// Util functions.
+namespace {
+
+MachineBasicBlock *
+nearest_common_dominator(MachineDominatorTree *DT,
+                         BlockSet &Blocks) {
+  auto I = Blocks.begin(), E = Blocks.end();
+
+  MachineBasicBlock *DomB = cast<MachineBasicBlock>(*(I++));
+  while (I != E) {
+    MachineBasicBlock *B = cast<MachineBasicBlock>(*(I++));
+    DomB = DT->findNearestCommonDominator(DomB, B);
+    if (DomB == nullptr)
+      return nullptr;
+  }
+  // For a split block like:
+  // bb.42:
+  //   %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec
+  //   %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicit-def $exec,
+  //     implicit-def $scc, implicit $exec
+  //
+  // bb.68:
+  // ; predecessors: %bb.42
+  //   successors: %bb.45(0x40000000), %bb.43(0x40000000);
+  //     %bb.45(50.00%), %bb.43(50.00%)
+  //
+  //   SI_MASK_BRANCH %bb.43, implicit $exec
+  //   S_BRANCH %bb.45
+  //
+  // which was split from:
+  // bb.42:
+  //   %129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec
+  //   %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicit-def $exec, ...
+  //   SI_MASK_BRANCH %bb.43, implicit $exec
+  //   S_BRANCH %bb.45
+  //
+  // the real common dominator is bb.42.
+  // TODO: use the _term version of the exec update instructions so this is not
+  // needed anymore.
+  if (DomB && DomB->pred_size() == 1 && !DomB->empty()) {
+    // Upstreaming note: This used to be SI_MASK_BRANCH
+    if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) {
+      MachineBasicBlock *Pred = *DomB->pred_begin();
+      if (Pred->succ_size() == 1 &&
+          (Pred->empty() || !Pred->back().isBranch())) {
+        DomB = Pred;
+      }
+    }
+  }
+
+  return DomB;
+}
+
+MachineBasicBlock *find_non_loop_dominator(MachineBasicBlock *BB,
+                                           MachineDominatorTree *DT,
+                                           MachineLoopInfo *LI) {
+  while (LI->getLoopDepth(BB) > 0) {
+    MachineDomTreeNode *N = DT->getNode(BB);
+    if (N == nullptr)
+      return nullptr;
+    MachineDomTreeNode *IDom = N->getIDom();
+    if (IDom == nullptr)
+      return nullptr;
+
+    BB = IDom->getBlock();
+  }
+
+  return BB;
+}
+
+MachineBasicBlock *
+FindInsertBlock(MachineInstr &DefMI, unsigned Reg, MachineDominatorTree *DT,
+                MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+                const MachineRegisterInfo &MRI, bool bMemBound) {
+
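+  // Collect the blocks that use Reg; the insert block starts as their nearest
+  // common dominator.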
+  BlockSet BBSet;
+  for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+    BBSet.insert(UseMI.getParent());
+  }
+  if (BBSet.size() == 0)
+    return nullptr;
+
+  MachineBasicBlock *BB = *BBSet.begin();
+  if (BBSet.size() > 1) {
+    MachineBasicBlock *BDom = nearest_common_dominator(DT, BBSet);
+    if (!BDom)
+      return nullptr;
+    BB = BDom;
+  }
+  // Try to find non loop dominator.
+  if (!bMemBound) {
+    BB = find_non_loop_dominator(BB, DT, MLI);
+  }
+  if (!BB)
+    return nullptr;
+
+  // If BB is already a hot block, moving the def to BB will not help;
+  // hotBlockRemat will reject it when processing BB.
+
+  // The insert block must be reachable from DefMI.
+  if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB))
+    return nullptr;
+
+  return BB;
+}
+
+bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
+  unsigned OpNum = DefMI->getNumOperands();
+
+  // Only move a DefMI whose register operands all have unique defs.
+  for (unsigned i = 0; i < OpNum; i++) {
+    MachineOperand &Op = DefMI->getOperand(i);
+    if (!Op.isReg())
+      continue;
+    if (!MRI.getUniqueVRegDef(Op.getReg()) &&
+        !llvm::IsSub0Sub1SingleDef(Op.getReg(), MRI)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+
+// SGPRs have alignment requirements, so the exact register count is unknown.
+const unsigned NearTargetRegLimit = 10;
+bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, MachineFunction &MF) {
+  unsigned maxSGPR = ST->getAddressableNumSGPRs();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+  if (ScratchRSrcReg)
+    maxSGPR -= 4;
+
+  const unsigned AlignmentDelta = 3;
+  maxSGPR -= AlignmentDelta;
+
+  return maxSPressure > maxSGPR;
+}
+
+struct RematStatus {
+  unsigned TargetOcc;
+  unsigned TargetVLimit;
+  unsigned TargetSLimit;
+  unsigned MaxVPressure;
+  unsigned MaxSPressure;
+  unsigned InputPhysicalVPressure;
+  unsigned InputPhysicalSPressure;
+  // Higher occupancy helps more than the latency cost of reaching it.
+  bool bMemBound;
+  // abs(VTargetOcc-STargetOcc) > 1.
+  bool bNotBalance;
+  DenseMap<MachineBasicBlock *, GCNRegPressure> MBBPressureMap;
+  DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBInputLiveMap;
+  DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBOutputLiveMap;
+  // Collect MBBs which have memory writes. When moving instructions across
+  // MBBs, skip memory instructions if the MBB has a memory write. To keep
+  // things fast, just check mayStore and isBarrier.
+  DenseSet<MachineBasicBlock *> MemWriteMBBSet;
+};
+
+unsigned CollectMBBPressure(
+    MachineBasicBlock &MBB, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+    const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
+    RematStatus &status) {
+  // Skip processing current block if it has only debug instructions
+  if (MBB.getFirstNonDebugInstr() == MBB.end())
+    return ST->getOccupancyWithNumVGPRs(0);
+  auto BBEnd = MBB.rbegin();
+  GCNUpwardRPTracker RPTracker(*LIS);
+  // R.End doesn't point to the boundary instruction.
+  // Skip Debug instr.
+  if (!llvm::GetNonDebugMBBEnd(BBEnd, MBB))
+    return ST->getOccupancyWithNumVGPRs(0);
+
+  GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
+  RPTracker.reset(*BBEnd, &outputLive, true);
+
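+  // Walk the block bottom-up to track register pressure, and remember blocks
+  // that write memory or contain a non-branch barrier.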
+  for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) {
+    MachineInstr &MI = (*I++);
+    RPTracker.recede(MI);
+    if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH))
+      status.MemWriteMBBSet.insert(&MBB);
+  }
+
+  GCNRegPressure RP = RPTracker.getMaxPressureAndReset();
+  unsigned sPressure = RP.getMaxSGPR();
+  if (sPressure > maxSPressure) {
+    maxSPressure = sPressure;
+  }
+  if (RP.getVGPRNum(ST->hasGFX90AInsts()) > maxVPressure) {
+    maxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+  }
+  status.MBBPressureMap[&MBB] = RP;
+  return RP.getOccupancy(*ST);
+}
+
+unsigned CollectFnPressure(
+    MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+    const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
+    RematStatus &status) {
+  unsigned TgtOcc = ST->getOccupancyWithLocalMemSize(MF);
+  // With only one block, the input/output virtual live sets are empty.
+  if (MF.size() > 1) {
+    // Build input output live reg first.
+    auto *SlotIndexes = LIS->getSlotIndexes();
+    DenseMap<MachineBasicBlock *, SlotIndex> MBBInputSlotMap;
+    DenseMap<MachineBasicBlock *, SlotIndex> MBBOutputSlotMap;
+    for (MachineBasicBlock &MBB : MF) {
+      auto BBBegin = MBB.getFirstNonDebugInstr();
+      if (BBBegin != MBB.end()) {
+        auto SI = SlotIndexes->getInstructionIndex(*BBBegin);
+        MBBInputSlotMap[&MBB] = SI;
+      }
+
+      auto BBEnd = MBB.rbegin();
+
+      // R.End doesn't point to the boundary instruction.
+      // Skip Debug instr.
+      if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) {
+        auto SI = SlotIndexes->getInstructionIndex(*BBEnd);
+        MBBOutputSlotMap[&MBB] = SI;
+      }
+    }
+
+    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+      auto Reg = Register::index2VirtReg(I);
+      if (!LIS->hasInterval(Reg))
+        continue;
+
+      const auto &LI = LIS->getInterval(Reg);
+
+      // Skip local live intervals to make the live input/output computation
+      // faster.
+      if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+        continue;
+
+      for (auto inputIt : MBBInputSlotMap) {
+        MachineBasicBlock *MBB = inputIt.first;
+        auto SI = inputIt.second;
+
+        auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+        if (LiveMask.any())
+          status.MBBInputLiveMap[MBB][Reg] |= LiveMask;
+      }
+
+      for (auto outputIt : MBBOutputSlotMap) {
+        MachineBasicBlock *MBB = outputIt.first;
+        auto SI = outputIt.second;
+
+        auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+        if (LiveMask.any())
+          status.MBBOutputLiveMap[MBB][Reg] |= LiveMask;
+      }
+    }
+  }
+
+  LLVM_DEBUG(
+      const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+      dbgs() << "output live"; for (auto &it
+                                    : status.MBBOutputLiveMap) {
+        unsigned Idx = it.first->getNumber();
+        auto LiveReg = it.second;
+        dbgs() << "MBB" << Idx << ":";
+        llvm::dumpLiveSet(LiveReg, SIRI);
+      } dbgs() << "input live";
+      for (auto &it
+           : status.MBBInputLiveMap) {
+        unsigned Idx = it.first->getNumber();
+        auto LiveReg = it.second;
+        dbgs() << "MBB" << Idx << ":";
+        llvm::dumpLiveSet(LiveReg, SIRI);
+      });
+
+  for (auto it = MF.begin(); it != MF.end(); ++it) {
+    MachineBasicBlock &MBB = *it;
+    unsigned Occ = CollectMBBPressure(MBB, LIS, MRI, ST, maxVPressure,
+                                      maxSPressure, status);
+    if (TgtOcc > Occ)
+      TgtOcc = Occ;
+  }
+  return TgtOcc;
+}
+RematStatus
+GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
+               const MachineRegisterInfo &MRI, const GCNSubtarget *ST) {
+  unsigned maxSPressure = 0;
+  unsigned maxVPressure = 0;
+  RematStatus status;
+  unsigned TgtOcc = CollectFnPressure(MF, LIS, MRI, ST, maxVPressure,
+                                      maxSPressure, status);
+  const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+  if (TgtOcc >= MaxOcc) {
+    status.TargetOcc = TgtOcc;
+    status.TargetVLimit = 0;
+    status.TargetSLimit = 0;
+    status.MaxVPressure = 0;
+    status.MaxSPressure = 0;
+    status.InputPhysicalVPressure = 0;
+    status.InputPhysicalSPressure = 0;
+    status.bMemBound = false;
+    status.bNotBalance = false;
+    return status;
+  }
+
+  maxSPressure += RegForVCC;
+  maxVPressure = std::min(maxVPressure, ST->getMaxNumVGPRs(MF));
+  unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(maxSPressure);
+  unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(maxVPressure);
+
+  llvm::SchedScore totalScore = llvm::CollectLatency(MF, *ST, MLI);
+  bool bMemBound =
+      totalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc);
+
+  bool bNotBalance = false;
+
+  const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU();
+  // Currently, only an sgpr-bound occupancy can be fixed with remat.
+  if (STgtOcc < VTgtOcc) {
+    unsigned bigOcc = std::max(STgtOcc, VTgtOcc);
+    // Change TgtOcc to bigOcc in case sgpr and vgpr are not balanced.
+    if (bigOcc > TgtOcc) {
+      TgtOcc = bigOcc;
+      bNotBalance = true;
+      if (TgtOcc >= MaxOccupancy)
+        TgtOcc = MaxOccupancy-1;
+    }
+  }
+
+  // Collect input physical pressure.
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+  unsigned vInputPressure = 0;
+  uint64_t sInputMask = 0;
+  for (const auto &livein : MRI.liveins()) {
+    const Register Reg = livein.first;
+    const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+    assert(Reg.isPhysical() && "input must be physical reg");
+    unsigned RegSize = RC->getLaneMask().getNumLanes();
+    if (SIRI->isVGPR(MRI, Reg)) {
+      vInputPressure += RegSize;
+    } else {
+      unsigned RegIndex = SIRI->getHWRegIndex(Reg);
+      uint64_t mask = ((1 << RegSize) - 1 ) << RegIndex;
+      sInputMask |= mask;
+    }
+  }
+  // SGPRs need to be aligned to 4 for the 4-dword/8-dword descriptors, which
+  // causes high pressure. Count every aligned group of 4 SGPRs that contains
+  // any live-in as 4 registers.
+  unsigned sInputPressure = 0;
+  uint64_t mask = 0xf;
+  while (mask != 0) {
+    if (mask & sInputMask) {
+      sInputPressure += 4;
+    }
+    mask = mask << 4;
+  }
+
+  // If balanced, try next occupancy.
+  TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1);
+
+  auto CC = MF.getFunction().getCallingConv();
+  bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS;
+  // For shader stages other than PS/CS, cap the target occupancy at 4.
+  if (!IsPsCs) {
+    TgtOcc = TgtOcc > 4 ? 4 : TgtOcc;
+  }
+  if (TargetOccupancy)
+    TgtOcc = TargetOccupancy;
+
+  unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true);
+  unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc);
+
+  status.TargetOcc = TgtOcc;
+  status.TargetVLimit = VLimit;
+  status.TargetSLimit = SLimit;
+  status.MaxVPressure = maxVPressure;
+  status.MaxSPressure = maxSPressure;
+  status.InputPhysicalVPressure = vInputPressure;
+  status.InputPhysicalSPressure = sInputPressure;
+  status.bMemBound = bMemBound;
+  status.bNotBalance = bNotBalance;
+  return status;
+}
+
+} // namespace
+
+// Remat.
+namespace {
+
+struct RematNode {
+  enum class RematKind {
+    Candidate, // Not ready yet.
+    OneDefOneUse,
+    Clone,
+  };
+  RematNode()
+      : Reg(0), DefMI(nullptr), Kind(RematKind::Candidate),
+        InsertPointMI(nullptr), InsertBlock(nullptr), Size(0) {}
+  RematNode(unsigned R, MachineInstr *MI, unsigned S)
+      : Reg(R), DefMI(MI), Kind(RematKind::Candidate), InsertPointMI(nullptr),
+        InsertBlock(nullptr), Size(S) {}
+  RematNode(const RematNode &N)
+      : Reg(N.Reg), DefMI(N.DefMI), Kind(N.Kind),
+        InsertPointMI(N.InsertPointMI), InsertBlock(N.InsertBlock),
+        Size(N.Size) {}
+  unsigned Reg;
+  MachineInstr *DefMI;
+  MachineBasicBlock *InsertBlock;
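+  // InsertPointMI is the insert point once it is known. UserCount temporarily
+  // holds a packed (size, user-block count) sort key while building the clone
+  // list.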
+  union {
+    MachineInstr *InsertPointMI;
+    unsigned UserCount;
+  };
+  RematKind Kind;
+  unsigned Size;
+};
+
+struct BlockLiveInfo {
+  MachineBasicBlock *BB;
+  unsigned maxSReg;
+  unsigned maxVReg;
+  // Input live is the set of regs that are live across blocks.
+  const GCNRPTracker::LiveRegSet inputLive;
+};
+
+// Skip live regs that have been remated into other blocks.
+void UpdateLiveInfo(MapVector<unsigned, RematNode> &RematMap,
+                    GCNRPTracker::LiveRegSet &LiveSet,
+                    const GCNRPTracker::LiveRegSet &inputLive,
+                    MachineBasicBlock *CurBB,
+                    DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+  for (auto &it : RematMap) {
+    unsigned Reg = it.first;
+    // Skip reg not in live set.
+    if (!LiveSet.count(Reg))
+      continue;
+    // Skip regs already in the input set.
+    // The input set will be taken care of in GetReducedSize.
+    if (inputLive.count(Reg))
+      continue;
+
+    auto &Node = it.second;
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      MachineBasicBlock *InsertBB = Node.InsertBlock;
+      // If CurBB is after InsertBB in reverse post order, the def is still
+      // before CurBB, so the reg is still live.
+      unsigned LiveBBIndex = RPOTIndexMap[CurBB];
+      unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+      if (LiveBBIndex > InsertBBIndex) {
+        continue;
+      }
+    }
+    // Already in remat map, don't need to check again, remove from
+    // candidate.
+    LiveSet.erase(Reg);
+  }
+}
+
+int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR,
+                         const MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI) {
+
+  // Find shared operand in ReducedInsts.
+  int SharedSize = 0;
+  DenseMap<unsigned, LaneBitmask> SharedRegMaskMap;
+  for (MachineInstr *DefMI : ReducedInsts) {
+    for (MachineOperand &MO : DefMI->operands()) {
+      if (MO.isImm())
+        continue;
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef())
+        continue;
+      if (MO.isTied())
+        continue;
+      Register Reg = MO.getReg();
+
+      if (Reg == AMDGPU::EXEC)
+        continue;
+      if (!Reg.isVirtual())
+        continue;
+
+      bool isVGPR = SIRI->isVGPR(MRI, MO.getReg());
+      if (bVGPR != isVGPR) {
+        // Mixing VGPRs and SGPRs is not supported for remat yet.
+        continue;
+      }
+
+      const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+      int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+      unsigned Mask;
+      if (unsigned SubIdx = MO.getSubReg()) {
+        OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+        int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5;
+        Mask = (1 << SubMOSize) - 1;
+      } else {
+        Mask = (1 << MOSize) - 1;
+      }
+      auto SharedRegIt = SharedRegMaskMap.find(Reg);
+      if (SharedRegIt == SharedRegMaskMap.end()) {
+        SharedRegMaskMap[Reg] = LaneBitmask(Mask);
+      } else {
+        unsigned PrevMask = SharedRegIt->second.getAsInteger();
+        if (unsigned SharedMask = (PrevMask & Mask)) {
+          // Something is shared.
+          for (int i = 0; i < MOSize; i++) {
+            if (SharedMask & (1 << i)) {
+              SharedSize += 1;
+            }
+          }
+        }
+        LaneBitmask MoMask = LaneBitmask(Mask | PrevMask);
+        SharedRegMaskMap[Reg] = MoMask;
+      }
+    }
+  }
+  return SharedSize;
+}
+
+int GetReducedSize(MapVector<unsigned, RematNode> &RematMap, bool bVGPR,
+                   GCNRPTracker::LiveRegSet &CanidateSet,
+                   InstSet &ReducedInsts,
+                   const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                   BlockLiveInfo &LiveInfo,
+                   DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
+  int ReducedSize = 0;
+  for (auto &it : RematMap) {
+    unsigned Reg = it.first;
+
+    if (!CandidateSet.count(Reg))
+      continue;
+
+    bool bReduced = false;
+    auto &Node = it.second;
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      MachineBasicBlock *InsertBB = Node.InsertBlock;
+      // If LiveInfo.BB is before InsertBB in reverse post order, the def is
+      // moved after LiveInfo.BB, so it is not live anymore.
+      unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB];
+      unsigned InsertBBIndex = RPOTIndexMap[InsertBB];
+      if (LiveBBIndex < InsertBBIndex)
+        bReduced = true;
+    } else {
+      // Clone.
+      bReduced = true;
+      // If there is a use in LiveInfo.BB, the reg cannot be reduced from the
+      // input live set.
+      for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+        if (UseMI.getParent() == LiveInfo.BB) {
+          bReduced = false;
+          break;
+        }
+      }
+    }
+    if (bReduced) {
+      ReducedSize += Node.Size;
+      ReducedInsts.insert(Node.DefMI);
+    }
+
+    // Already in the remat map; no need to check again, remove from the
+    // candidates.
+    CandidateSet.erase(Reg);
+  }
+
+  return ReducedSize;
+}
+
+int RematGain(MachineInstr *DefMI, unsigned Reg,
+              GCNRPTracker::LiveRegSet &CandidateRegSet,
+              const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+              bool bVGPR) {
+  int rematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+  for (MachineOperand &MO : DefMI->operands()) {
+    if (MO.isImm())
+      continue;
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef())
+      continue;
+    if (MO.isTied())
+      continue;
+
+    if (MO.getReg() == AMDGPU::EXEC)
+      continue;
+
+    // Don't move user of VCC.
+    if (MO.getReg() == AMDGPU::VCC) {
+      rematSize = 0;
+      break;
+    }
+    Register Reg = MO.getReg();
+
+    // Don't move physical register use.
+    if (Reg.isPhysical()) {
+      rematSize = 0;
+      break;
+    }
+
+    bool isVGPR = SIRI->isVGPR(MRI, Reg);
+    if (bVGPR != isVGPR) {
+      // Mixing VGPRs and SGPRs is not supported for remat yet.
+      // TODO: count possible pressure change here.
+      rematSize = 0;
+      break;
+    }
+    bool bSingleDef = MRI.hasOneDef(Reg);
+    if (!bSingleDef) {
+      bSingleDef = llvm::IsSub0Sub1SingleDef(Reg, MRI);
+    }
+
+    if (bSingleDef) {
+      // The reg might be shared with other candidates, but that is not
+      // checked here. Shared regs are counted in GetReducedSize.
+      if (EnableAggressive) {
+        // In case of aggressive remat, treat multi-use regs as shared regs
+        // and ignore the size of shared regs.
+        if (!MRI.hasOneNonDBGUse(Reg))
+          continue;
+      }
+      const TargetRegisterClass *OpRC = MRI.getRegClass(Reg);
+      if (unsigned SubIdx = MO.getSubReg()) {
+        if (OpRC)
+          OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx);
+      }
+      int inputSize = SIRI->getRegSizeInBits(*OpRC);
+      // If the input is not live in the hot spot, moving it across the hot
+      // spot should take fewer registers than DefMI.
+      if (rematSize > inputSize) {
+        rematSize -= inputSize;
+        continue;
+      }
+    }
+
+    rematSize = 0;
+    break;
+  }
+  return rematSize;
+}
+
+void BuildRematCandiates(std::vector<RematNode> &Candidates,
+                         GCNRPTracker::LiveRegSet &CandidateRegSet,
+                         DenseSet<unsigned> &PinnedRegSet,
+                         const MachineRegisterInfo &MRI,
+                         const SIInstrInfo *SIII, const SIRegisterInfo *SIRI,
+                         bool bVGPR) {
+
+  for (auto liveRegIt : CandidateRegSet) {
+    unsigned Reg = liveRegIt.first;
+    // Skip unsafe reg.
+    if (PinnedRegSet.count(Reg))
+      continue;
+
+    bool isVGPR = SIRI->isVGPR(MRI, Reg);
+    if (isVGPR != bVGPR)
+      continue;
+    bool bSafeCandidate = true;
+    MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+    if (MI) {
+      if (bVGPR) {
+        // Only remat VALU instructions for now.
+        if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY)
+          bSafeCandidate = false;
+        if (MI->getOpcode() == AMDGPU::COPY) {
+          // Make sure the source has a unique def.
+          if (MI->getOperand(1).isReg() &&
+              nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg()))
+            bSafeCandidate = false;
+        } else {
+          // Skip convergent valu.
+          if (MI->isConvergent())
+            bSafeCandidate = false;
+        }
+      }
+      // Skip instructions with more than one def.
+      if (MI->getDesc().NumDefs > 1)
+        bSafeCandidate = false;
+    } else {
+      bSafeCandidate = false;
+    }
+
+    if (bSafeCandidate) {
+      int gain = RematGain(MI, Reg, CandidateRegSet, MRI, SIRI, bVGPR);
+      if (gain > 0) {
+        Candidates.emplace_back(RematNode(Reg, MI, gain >> 5));
+      } else {
+        bSafeCandidate = false;
+      }
+    }
+    // Save unsafe reg.
+    if (!bSafeCandidate)
+      PinnedRegSet.insert(Reg);
+  }
+
+  // Sort by gain.
+  std::sort(Candidates.begin(), Candidates.end(),
+            [](RematNode &i, RematNode &j) { return i.Size > j.Size; });
+}
+
+// For a case like
+//   %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, implicit-def dead $scc; xb.uniform
+//  S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; xb.uniform
+//  %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc; xb.uniform
+// sinking the S_AND right before the S_CSELECT would overwrite SCC.
+// To avoid that, skip the case where DefMI has an implicit def that UseMI
+// reads.
+bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
+  if (DefMI->getDesc().NumImplicitDefs == 0)
+    return false;
+
+  auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo();
+  for (MachineOperand &def : DefMI->implicit_operands()) {
+    if (!def.isReg())
+      continue;
+    if (def.isUse())
+      continue;
+    unsigned Reg = def.getReg();
+    if (UseMI->readsRegister(Reg, TRI))
+      return true;
+  }
+  return false;
+}
+
+void AddOneDefOneUseCandidate(RematNode &Node,
+                              std::vector<RematNode> &RematList,
+                              MachineRegisterInfo &MRI, int &rematCnt,
+                              MachineDominatorTree *DT,
+                              MachinePostDominatorTree *PDT,
+                              MachineLoopInfo *MLI, bool bVGPR,
+                              bool bMemBound) {
+  unsigned Reg = Node.Reg;
+  MachineInstr *DefMI = Node.DefMI;
+
+  unsigned size = Node.Size;
+  MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin();
+  MachineBasicBlock *InsertBB = UseMI->getParent();
+
+  // For VGPRs, always move next to the only user to avoid WQM or exec issues.
+  // But doing this causes problems when DefMI is in WQM and the single user is
+  // not, so VGPR remat is disabled for now.
+  // TODO: make sure the single user does not need WQM.
+  if (!bVGPR) {
+    if (MachineBasicBlock *NewInsertBB =
+            FindInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, bMemBound)) {
+      if (InsertBB != NewInsertBB) {
+        InsertBB = NewInsertBB;
+        // If a non-loop insert block was found, insert before its first
+        // non-debug, non-PHI instruction.
+        if (DefMI->getParent() != InsertBB) {
+          if (!InsertBB->empty()) {
+            auto it = InsertBB->getFirstNonPHI();
+            it = skipDebugInstructionsForward(it, InsertBB->end());
+            if (it == InsertBB->end())
+              UseMI = nullptr;
+            else
+              UseMI = &*it;
+          }
+        }
+      }
+    }
+  }
+
+  if (bVGPR) {
+    // Don't count regs in the same block for VALU.
+    if (UseMI->getParent() == DefMI->getParent())
+      return;
+  }
+
+  // Skip the case where DefMI has an implicit def that UseMI uses.
+  if (isImplicitDefUse(DefMI, UseMI)) {
+    return;
+  }
+
+  Node.InsertBlock = InsertBB;
+  Node.InsertPointMI = UseMI;
+  Node.Kind = RematNode::RematKind::OneDefOneUse;
+  RematList.emplace_back(Node);
+  rematCnt += size;
+}
+
+void AddCloneCandidate(std::vector<RematNode *> &cloneList,
+                       std::vector<RematNode> &RematList,
+                       DenseSet<unsigned> &PinnedRegSet,
+                       MachineRegisterInfo &MRI, int &rematCnt,
+                       SlotIndexes *SlotIndexes, MachineFunction &MF) {
+  // Group users by block.
+  std::vector<BlockSet> UserSetList(cloneList.size());
+
+  for (int i = 0; i < cloneList.size(); i++) {
+    auto *Node = cloneList[i];
+    unsigned Reg = Node->Reg;
+    MachineInstr *DefMI = Node->DefMI;
+    // Group users by block.
+    BlockSet &UserSet = UserSetList[i];
+
+    for (auto useIt = MRI.use_instr_nodbg_begin(Reg);
+         useIt != MRI.use_instr_nodbg_end();) {
+      MachineInstr &UseMI = *(useIt++);
+      UserSet.insert(UseMI.getParent());
+    }
+
+    if (UserSet.size() == 1) {
+      // All users are in same block with DefMI.
+      if (*UserSet.begin() == DefMI->getParent()) {
+        // Mark as cannot-remat for now.
+        // TODO: try to split if it is bigger than 4 and only used once per
+        // channel.
+        PinnedRegSet.insert(Reg);
+        continue;
+      }
+    }
+
+    int size = Node->Size;
+    size <<= 16;
+    // Pack userSet size to size.
+    size |= UserSet.size();
+    Node->UserCount = size;
+  }
+
+  std::sort(cloneList.begin(), cloneList.end(),
+            // Sort based on userSet size.
+            [](const RematNode *a, const RematNode *b) {
+              static constexpr int mask = 0xffff;
+              return (a->UserCount & mask) < (b->UserCount & mask);
+            });
+
+  for (RematNode *Node : cloneList) {
+    Node->Kind = RematNode::RematKind::Clone;
+    RematList.emplace_back(*Node);
+    rematCnt += Node->Size;
+  }
+}
+
+int FilterRematCandiates(std::vector<RematNode> &Candidates,
+                         std::vector<RematNode> &RematList,
+                         DenseSet<unsigned> &PinnedRegSet,
+                         MachineDominatorTree *DT,
+                         MachinePostDominatorTree *PDT, MachineLoopInfo *MLI,
+                         MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                         MachineFunction &MF, SlotIndexes *SlotIndexes,
+                         bool bVGPR, bool bMemBound) {
+  int rematCnt = 0;
+  // Work one def one use first.
+  for (auto &Node : Candidates) {
+    unsigned Reg = Node.Reg;
+    if (!MRI.hasOneNonDBGUse(Reg)) {
+      continue;
+    }
+    MachineInstr *DefMI = Node.DefMI;
+    if (!IsSafeToMove(DefMI, MRI)) {
+      PinnedRegSet.insert(Reg);
+      continue;
+    }
+
+    AddOneDefOneUseCandidate(Node, RematList, MRI, rematCnt, DT, PDT, MLI,
+                             bVGPR, bMemBound);
+  }
+
+  if (!bVGPR) {
+    std::vector<RematNode *> cloneList;
+    // Try multi use case.
+    for (auto &Node : Candidates) {
+      unsigned Reg = Node.Reg;
+      if (MRI.hasOneNonDBGUse(Reg)) {
+        continue;
+      }
+      MachineInstr *DefMI = Node.DefMI;
+      if (!IsSafeToMove(DefMI, MRI)) {
+        PinnedRegSet.insert(Reg);
+        continue;
+      }
+
+      // Clone for each user.
+      cloneList.emplace_back(&Node);
+    }
+
+    AddCloneCandidate(cloneList, RematList, PinnedRegSet, MRI, rematCnt,
+                      SlotIndexes, MF);
+  }
+
+  return rematCnt;
+}
+
+void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef,
+                SmallVector<MachineInstr *, 2> &userMIs) {
+  for (MachineInstr *UseMI : userMIs) {
+    for (MachineOperand &MO : UseMI->operands()) {
+      if (!MO.isReg())
+        continue;
+      if (MO.getReg() == Reg) {
+        MO.setReg(NewReg);
+        if (bSubRegDef)
+          MO.setSubReg(0);
+      }
+    }
+  }
+}
+
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+    unsigned Reg, BlockMap<SmallVector<MachineInstr *, 2>> &userBlocks,
+    DenseSet<MachineBasicBlock *> &UserMBBSet,
+    std::vector<BlockLiveInfo> &hotBlocks, MachineDominatorTree *pDT) {
+  // Collect hot blocks which Exp is live in.
+  DenseSet<MachineBasicBlock *> hotBlockSet;
+  for (BlockLiveInfo &hotBlock : hotBlocks) {
+    if (hotBlock.inputLive.count(Reg)) {
+      hotBlockSet.insert(hotBlock.BB);
+    }
+  }
+
+  // For user blocks which dominate all hot blocks, there is no need to clone,
+  // because the value no longer crosses the hot blocks once the later blocks
+  // are cloned.
+  // User blocks which are dominated by all hot blocks can share clones,
+  // because after the hot blocks the pressure is OK.
+  DenseSet<MachineBasicBlock *> afterHotRangeMBBs;
+  for (MachineBasicBlock *MBB : UserMBBSet) {
+    // Always clone in hot block.
+    if (hotBlockSet.count(MBB))
+      continue;
+
+    bool bDomAllHotBlocks = true;
+    bool bDomedByAllHotBlocks = true;
+    for (MachineBasicBlock *hotMBB : hotBlockSet) {
+      if (!pDT->dominates(MBB, hotMBB)) {
+        bDomAllHotBlocks = false;
+      }
+      if (!pDT->dominates(hotMBB, MBB)) {
+        bDomedByAllHotBlocks = false;
+      }
+      if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) {
+        break;
+      }
+    }
+    if (bDomAllHotBlocks) {
+      userBlocks.erase(MBB);
+    } else if (bDomedByAllHotBlocks) {
+      afterHotRangeMBBs.insert(MBB);
+    }
+  }
+
+  // Partition the after-hot-range block set by the dominator tree.
+  DenseMap<MachineBasicBlock *, BlockSet> DomMap;
+  if (!afterHotRangeMBBs.empty()) {
+    for (auto it : afterHotRangeMBBs) {
+      MachineBasicBlock *MBB = it;
+      for (auto it2 : afterHotRangeMBBs) {
+        MachineBasicBlock *MBB2 = it2;
+        if (MBB == MBB2)
+          continue;
+        if (pDT->dominates(MBB, MBB2)) {
+          auto &Dom = DomMap[MBB];
+          Dom.insert(MBB2);
+          auto &Dom2 = DomMap[MBB2];
+          Dom.insert(Dom2.begin(), Dom2.end());
+        }
+      }
+    }
+    for (auto it : afterHotRangeMBBs) {
+      MachineBasicBlock *MBB = it;
+      auto &Dom = DomMap[MBB];
+      for (MachineBasicBlock *domedMBB : Dom) {
+        // Remove domedMBB.
+        DomMap.erase(domedMBB);
+        UserMBBSet.erase(domedMBB);
+      }
+    }
+  }
+
+  return DomMap;
+}
+
+// Look for an earlier insert point if the InstructionToMove
+// writes to scc and scc is live at the CurrentInsertPoint.
+static MachineBasicBlock::iterator
+AdjustInsertPointToAvoidSccSmash(MachineInstr *InstructionToMove,
+                                 MachineBasicBlock *MBB,
+                                 MachineBasicBlock::iterator CurrentInsertPoint,
+                                 MachineRegisterInfo &MRI,
+                                 const SIRegisterInfo *SIRI,
+                                 const SIInstrInfo *SIII) {
+  const bool WillSmashScc =
+      InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
+  if (WillSmashScc)
+    CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(
+        MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+
+  return CurrentInsertPoint;
+}
+
+// Look for an earlier insert point if the SubExp
+// writes to scc and scc is live at the CurrentInsertPoint.
+static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash(
+    const SubExp &SubExpToMove, MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI);
+  if (WillSmashScc)
+    CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(
+        MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+
+  return CurrentInsertPoint;
+}
+
+// Return true if moving MI to Location will smash a live scc value.
+static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB,
+                                   MachineBasicBlock::iterator Location) {
+  // It is ok to pass nullptr to `modifiesRegister` for TRI here since
+  // SCC has no subreg/superreg relationships.
+  return MI->modifiesRegister(AMDGPU::SCC, nullptr) &&
+         llvm::IsSccLiveAt(MBB, Location);
+}
+
+void ApplyCloneRemat(Remat *Remat,
+                     RematNode &Node, std::vector<BlockLiveInfo> &hotBlocks,
+                     MachineDominatorTree *pDT, MachineRegisterInfo &MRI,
+                     SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
+                     const SIInstrInfo *SIII, MachineFunction &MF) {
+  unsigned Reg = Node.Reg;
+
+  MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+  auto DefOp = DefMI->getOperand(0);
+  const MCInstrDesc &Desc = DefMI->getDesc();
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  // When the unique def has subReg, just create newReg for the subReg part.
+  bool bSubRegDef = false;
+  if (DefOp.getSubReg() != 0) {
+    RC = SIRI->getSubRegisterClass(RC, DefOp.getSubReg());
+    bSubRegDef = true;
+  }
+  const DebugLoc DL = DefMI->getDebugLoc();
+  unsigned OpNum = DefMI->getNumOperands();
+
+  Node.Kind = RematNode::RematKind::Clone;
+
+  // Group users by block.
+  BlockMap<SmallVector<MachineInstr *, 2>> UserMap;
+  DenseSet<MachineBasicBlock *> UserMBBSet;
+  for (auto useIt = MRI.use_instr_nodbg_begin(Reg);
+       useIt != MRI.use_instr_nodbg_end();) {
+    MachineInstr &UseMI = *(useIt++);
+    UserMap[UseMI.getParent()].emplace_back(&UseMI);
+    UserMBBSet.insert(UseMI.getParent());
+  }
+
+  DenseMap<MachineBasicBlock *, BlockSet> DomMap =
+      reduceClonedMBBs(Reg, UserMap, UserMBBSet, hotBlocks, pDT);
+
+  for (auto useIt : UserMap) {
+    MachineBasicBlock *MBB = useIt.first;
+    // Skip same block uses.
+    if (MBB == DefMI->getParent()) {
+      continue;
+    }
+    // Skip MBBs which share a clone from another MBB.
+    if (UserMBBSet.count(MBB) == 0)
+      continue;
+
+    unsigned NewReg = MRI.createVirtualRegister(RC);
+    auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg);
+    for (unsigned i = 1; i < OpNum; i++) {
+      NewDef = NewDef.add(DefMI->getOperand(i));
+    }
+
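+    // Insert the clone right before the earliest user (smallest slot index) in
+    // this block.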
+    MachineInstr *InsertPointMI = useIt.second.front();
+    SlotIndex lastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI);
+
+    for (MachineInstr *UseMI : useIt.second) {
+      SlotIndex slot = SlotIndexes->getInstructionIndex(*UseMI);
+      if (lastSlot > slot) {
+        lastSlot = slot;
+        InsertPointMI = UseMI;
+      }
+    }
+    
+    MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash(
+        DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII
+    );
+
+    for (MachineMemOperand *MO : DefMI->memoperands()) {
+      NewDef->addMemOperand(MF, MO);
+    }
+
+    MBB->insert(InsertPoint, NewDef);
+
+    SlotIndexes->insertMachineInstrInMaps(*NewDef);
+
+    SmallVector<MachineInstr *, 2> &userMIs = useIt.second;
+    updateUsers(Reg, NewReg, bSubRegDef, userMIs);
+
+    // Update users in dominated MBBs.
+    auto domMapIt = DomMap.find(MBB);
+    if (domMapIt != DomMap.end()) {
+      for (MachineBasicBlock *UpdateMBB : domMapIt->second) {
+        SmallVector<MachineInstr *, 2> &userMIs = UserMap[UpdateMBB];
+        updateUsers(Reg, NewReg, bSubRegDef, userMIs);
+      }
+    }
+
+    llvm::removeUnusedLanes(*NewDef.getInstr(), MRI, SIRI, SIII, SlotIndexes);
+  }
+  if (MRI.use_empty(Reg)) {
+    SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+    Remat->RemoveInst(DefMI);
+    DefMI->eraseFromParent();
+  }
+}
+
+void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
+                            SlotIndexes *slotIndexes,
+                            const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  MachineInstr *DefMI = Node.DefMI;
+  MachineInstr *InsertPointMI = Node.InsertPointMI;
+  MachineBasicBlock* MBB = nullptr;
+
+  // Find a valid insert point.
+  MachineBasicBlock::iterator InsertPoint;
+  if (InsertPointMI) {
+    InsertPoint = InsertPointMI->getIterator();
+    MBB = InsertPointMI->getParent();
+  } else {
+    InsertPoint = Node.InsertBlock->getFirstTerminator();
+    MBB = Node.InsertBlock;
+  }
+
+  InsertPoint = AdjustInsertPointToAvoidSccSmash(
+      DefMI, MBB, InsertPoint, MRI, SIRI, SIII
+  );
+  
+  // Move instruction to new location.
+  DefMI->removeFromParent();
+  InsertPoint->getParent()->insert(InsertPoint, DefMI);
+
+  // Update slot index.
+  slotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+  slotIndexes->insertMachineInstrInMaps(*DefMI);
+}
+
+void ApplyRemat(Remat *Remat, MapVector<unsigned, RematNode> &RematMap,
+                std::vector<BlockLiveInfo> &hotBlocks,
+                MachineDominatorTree *pDT, SlotIndexes *slotIndexes,
+                MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                const SIInstrInfo *SIII, MachineFunction &MF) {
+  std::vector<RematNode> UpdateList;
+  for (auto &it : RematMap) {
+    UpdateList.emplace_back(it.second);
+  }
+  // Sort the update list by slot index to make sure defs are moved before
+  // uses. If a use were moved before its def, it might not be the first use
+  // anymore.
+  std::sort(UpdateList.begin(), UpdateList.end(),
+            [&slotIndexes](RematNode &i, RematNode &j) {
+              SlotIndex a = slotIndexes->getInstructionIndex(*i.DefMI);
+              SlotIndex b = slotIndexes->getInstructionIndex(*j.DefMI);
+              return a < b;
+            });
+
+  for (RematNode &Node : UpdateList) {
+    if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
+      ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII);
+    } else if (Node.Kind == RematNode::RematKind::Clone) {
+      ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, MF);
+    }
+  }
+}
+
+void dumpRematMap(MapVector<unsigned, RematNode> &RematMap,
+                  const SIRegisterInfo *SIRI) {
+  dbgs() << "\n rematMap: \n";
+  for (auto it : RematMap) {
+    int Reg = it.first;
+    dbgs() << printReg(Reg, SIRI);
+    dbgs() << "\n";
+  }
+}
+
+int DebugBlockIndex = 42;
+
+void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet,
+                  MapVector<unsigned, RematNode> &VRematMap,
+                  MapVector<unsigned, RematNode> &SRematMap, int BlockIndex,
+                  const SIRegisterInfo *SIRI) {
+  if (DebugBlockIndex != BlockIndex)
+    return;
+  llvm::dumpLiveSet(LiveSet, SIRI);
+  dumpRematMap(VRematMap, SIRI);
+  dumpRematMap(SRematMap, SIRI);
+}
+
+void dumpCandidates(std::vector<RematNode> &RematCandidates, int BlockIndex,
+                    const SIRegisterInfo *SIRI) {
+  if (DebugBlockIndex != BlockIndex)
+    return;
+  dbgs() << "\n Candidates: \n";
+  unsigned TotalSize = 0;
+  for (RematNode &Node : RematCandidates) {
+    dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size;
+    dbgs() << "\n";
+    TotalSize += Node.Size;
+  }
+  dbgs() << "Total Size:" << TotalSize << "\n";
+}
+
+} // namespace
+
+bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
+                   LiveIntervals *LIS, MachineDominatorTree *pDT,
+                   MachinePostDominatorTree *pPDT, bool &bNearTarget) {
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+
+  const SIInstrInfo *SIII = ST->getInstrInfo();
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  DenseMap<MachineBasicBlock *, unsigned> RPOTIndexMap;
+  for (MachineBasicBlock *MBB : RPOT) {
+    RPOTIndexMap[MBB] = RPOTIndexMap.size();
+  }
+
+  auto &MRI = MF.getRegInfo();
+
+  bool bUpdated = false;
+  RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST);
+
+  const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+  if (status.TargetOcc >= MaxOcc)
+    return false;
+
+  unsigned VLimit = status.TargetVLimit;
+  unsigned SLimit = status.TargetSLimit;
+
+  int rematSCnt = status.MaxSPressure - SLimit;
+  // When doing aggressive sgpr remat, reserve some regs for allocation loss.
+  if (EnableAggressive)
+    rematSCnt += NearTargetRegLimit;
+
+  bool bSGPRSpill = false;
+  if (rematSCnt > 0) {
+    bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF);
+  }
+
+  bool bForceRematSgpr = bSGPRSpill | status.bNotBalance;
+
+  // If bound by lds, skip.
+  if (status.TargetOcc > ST->getOccupancyWithLocalMemSize(MF) &&
+      !bForceRematSgpr)
+    return false;
+
+  MachineBasicBlock *EntryMBB = &MF.front();
+
+  auto *SlotIndexes = LIS->getSlotIndexes();
+
+  // Regs already marked for remat.
+  MapVector<unsigned, RematNode> VRematMap;
+  MapVector<unsigned, RematNode> SRematMap;
+  // Regs which cannot be moved around for remat.
+  DenseSet<unsigned> PinnedRegSet;
+  std::vector<BlockLiveInfo> hotBlocks;
+  for (auto it = po_begin(EntryMBB); it != po_end(EntryMBB); it++) {
+    MachineBasicBlock *MBB = *it;
+    auto &RP = status.MBBPressureMap[MBB];
+    // Ignore blocks that are not hot.
+    if (RP.getVGPRNum(ST->hasGFX90AInsts()) < status.TargetVLimit &&
+        (RP.getMaxSGPR() + RegForVCC + status.InputPhysicalSPressure) <
+            status.TargetSLimit)
+      continue;
+    // Collect reg pressure.
+    unsigned maxVPressure = 0;
+    unsigned maxSPressure = 0;
+    const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB];
+
+    const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB];
+    LLVM_DEBUG(
+        dumpHotBlock(inputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI));
+
+    GCNDownwardRPTracker Tracker(*LIS);
+
+    Tracker.reset(*MBB->begin(), &inputLive);
+
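+    // Walk the block and compute the max pressure, with regs that are already
+    // remated into other blocks removed from the live sets.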
+    for (MachineInstr &MI : *MBB) {
+      if (MI.isDebugInstr())
+        continue;
+      Tracker.advance();
+      auto LISLR = Tracker.getLiveRegs();
+      // Update live set for things already remated.
+      UpdateLiveInfo(VRematMap, LISLR, inputLive, MBB, RPOTIndexMap);
+      UpdateLiveInfo(SRematMap, LISLR, inputLive, MBB, RPOTIndexMap);
+
+      const GCNRPTracker::LiveRegSet &liveSet = LISLR;
+      unsigned VPressure = 0;
+      unsigned SPressure = 0;
+      CollectLiveSetPressure(liveSet, MRI, SIRI, VPressure, SPressure);
+      if (maxVPressure < VPressure)
+        maxVPressure = VPressure;
+      if (maxSPressure < SPressure)
+        maxSPressure = SPressure;
+    }
+    maxSPressure += RegForVCC + status.InputPhysicalSPressure;
+    if (maxVPressure <= VLimit && maxSPressure <= SLimit)
+      continue;
+
+    // Build block live info.
+    // Use outputLive for EntryMBB.
+    BlockLiveInfo LiveInfo = {MBB, maxSPressure, maxVPressure,
+                              MBB != EntryMBB ? inputLive : outputLive};
+    // Skip the entry block when saving hotBlocks to reduce cloning, because we
+    // do not clone into the entry block.
+    if (MBB != EntryMBB)
+      hotBlocks.emplace_back(LiveInfo);
+    GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.inputLive;
+
+    // Update reg pressure based on remat list.
+    InstSet VReducedInsts;
+    InstSet SReducedInsts;
+    int VReduced =
+        GetReducedSize(VRematMap, /*bVGPR*/ true, CandidateRegs, VReducedInsts,
+                       MRI, SIRI, LiveInfo, RPOTIndexMap);
+    int SReduced =
+        GetReducedSize(SRematMap, /*bVGPR*/ false, CandidateRegs, SReducedInsts,
+                       MRI, SIRI, LiveInfo, RPOTIndexMap);
+
+    // Calculate the size that needs to be remated.
+    int rematVCnt = maxVPressure - VReduced - VLimit;
+    int rematSCnt = maxSPressure - SReduced - SLimit;
+
+    bool bSGPRSpill = false;
+    if (rematSCnt > 0) {
+      bSGPRSpill = nearSgprSpill(maxSPressure, ST, MF);
+    }
+    bool bForceRematSgpr = bSGPRSpill | status.bNotBalance;
+    // Try to add candidates into remat list.
+
+    int newRematSCnt = 0;
+    if (rematSCnt > 0) {
+      // Build candidate nodes.
+      std::vector<RematNode> SRematCandidates;
+      BuildRematCandiates(SRematCandidates, CandidateRegs, PinnedRegSet, MRI,
+                          SIII, SIRI, /*bVGPR*/ false);
+
+      LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI));
+      std::vector<RematNode> SRematList;
+      // Filter candidates.
+      newRematSCnt =
+          FilterRematCandiates(SRematCandidates, SRematList, PinnedRegSet, pDT,
+                               pPDT, MLI, MRI, SIRI, MF, SlotIndexes,
+                               /*bVGPR*/ false, status.bMemBound);
+      if (newRematSCnt > rematSCnt) {
+        // There are enough remat nodes to cover rematSCnt.
+        int rematCnt = 0;
+        for (RematNode &Node : SRematList) {
+          SRematMap[Node.Reg] = Node;
+          rematCnt += Node.Size;
+          if (rematCnt > rematSCnt && !EnableAggressive)
+            break;
+        }
+        newRematSCnt = 0;
+      } else {
+
+        for (RematNode &Node : SRematList) {
+          SReducedInsts.insert(Node.DefMI);
+        }
+        // Check shared size.
+        int SharedReducedSize =
+            GetSharedReducedSize(SReducedInsts, /*bVGPR*/ false, MRI, SIRI);
+        if (((newRematSCnt + SharedReducedSize) + (int)NearTargetRegLimit) >=
+            rematSCnt) {
+          for (RematNode &Node : SRematList) {
+            SRematMap[Node.Reg] = Node;
+          }
+        } else {
+          if (!bForceRematSgpr) {
+            return false;
+          } else {
+            for (RematNode &Node : SRematList) {
+              SRematMap[Node.Reg] = Node;
+            }
+            // Find local one def one use candidates.
+            for (MachineInstr &MI : *MBB) {
+              if (MI.isDebugInstr())
+                continue;
+              if (MI.getDesc().NumDefs != 1)
+                continue;
+              MachineOperand &DstMO = MI.getOperand(0);
+              Register Reg = DstMO.getReg();
+              if (!SIRI->isSGPRReg(MRI, Reg))
+                continue;
+              if (!MRI.hasOneNonDBGUse(Reg))
+                continue;
+              if (!MRI.hasOneDef(Reg))
+                continue;
+              if (Reg.isPhysical())
+                continue;
+              MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
+              if (UseMI.getParent() != MBB)
+                continue;
+              int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI,
+                                   /*bVGPR*/ false);
+              if (gain > 0) {
+                // Skip the case where DefMI has an implicit def that is used
+                // by UseMI.
+                if (isImplicitDefUse(&MI, &UseMI)) {
+                  continue;
+                }
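+                // Note: Size here appears to be a count of 32-bit registers;
+                // the gain from RematGain is shifted right by 5 to convert.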
+                RematNode Node = {Reg, &MI, (unsigned)gain >> 5};
+                Node.InsertPointMI = &UseMI;
+                Node.Kind = RematNode::RematKind::OneDefOneUse;
+                SRematMap[Reg] = Node;
+                SharedReducedSize += Node.Size;
+              }
+            }
+          }
+        }
+        newRematSCnt = rematSCnt - newRematSCnt - SharedReducedSize;
+      }
+    }
+    // If this works, continue.
+
+    // TODO for VGPR remat:
+    // Collect live ranges from the hot instructions, find the common live
+    // ranges among them, rematerialize those common live ranges, then apply
+    // the remat.
+
+    int newRematVCnt = 0;
+    if (rematVCnt > 0) {
+      // TODO: V remat.
+    }
+
+    bool bNeedSRemat = rematSCnt > 0;
+    bool bNeedVRemat = rematVCnt > 0;
+    // If SGPRs would spill, always do remat.
+    bool bSRematOK =
+        (newRematSCnt <= 0 && !SRematMap.empty()) || bForceRematSgpr;
+    bool bVRematOK =
+        (status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty();
+    if (bNeedSRemat && bNeedVRemat) {
+      if (bVRematOK && bSRematOK) {
+        bUpdated = true;
+      } else if (bSGPRSpill) {
+        bUpdated = true;
+      }
+    } else if (bNeedSRemat) {
+      if (bSRematOK) {
+        bUpdated = true;
+      }
+    } else if (bNeedVRemat) {
+      if (bVRematOK) {
+        bUpdated = true;
+      }
+    }
+    // TODO: what to do when cannot reach target?
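+    // newRematSCnt is the SGPR pressure still not covered by remat: tolerate a
+    // small miss (NearTargetRegLimit), otherwise give up unless SGPRs would
+    // spill anyway.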
+    if (newRematSCnt > 0) {
+      if (newRematSCnt <= NearTargetRegLimit) {
+        bNearTarget = true;
+      } else {
+        if (!bSGPRSpill)
+          return false;
+      }
+    }
+  }
+
+  if (SRematMap.empty() && VRematMap.empty()) {
+    return bUpdated;
+  }
+
+  if (!SRematMap.empty()) {
+    bUpdated = true;
+    ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII,
+               MF);
+    LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
+  }
+
+  // Balance between vector and scalar if possible.
+  return bUpdated;
+}
+
+namespace {
+bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+  DenseSet<MachineInstr *> DefMIs;
+  for (MachineInstr &DefMI : MRI.def_instructions(Reg)) {
+    // Skip implicit defs.
+    if (DefMI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+      continue;
+    DefMIs.insert(&DefMI);
+  }
+  return DefMIs.size() == 1;
+}
+
+static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) {
+  if (!MO.isImplicit() || !MO.isUse() || !MO.isReg())
+    return false;
+
+  return MO.getReg() == Reg;
+}
+
+static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) {
+  if (!MO.isImplicit() || !MO.isDef() || !MO.isReg())
+    return false;
+
+  return MO.getReg() == Reg;
+}
+
+static bool IsSafeRematCandidateUser(const MachineInstr *UseMI,
+                                     const SIInstrInfo *SIII) {
+  // Make sure UseMI is not WQM like sample.
+  if (SIII->isWQM(UseMI->getOpcode()))
+    return false;
+  if (UseMI->getOpcode() == AMDGPU::PHI)
+    return false;
+
+  return true;
+}
+
+static bool isConvergent(Remat *Remat, const MachineInstr &MI) {
+  return MI.isConvergent() &&
+    // This flag is set on readfirstlane's to indicate that they
+    // are redundant (the value being read is already uniform).
+    // Normally, readfirstlanes are convergent, because different exec
+    // will cause a different value to be read; a known uniform
+    // readfirstlane is safe to move or clone and not actually convergent.
+    !Remat->TotalUniformInsts.count(&MI);
+}
+
+bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
+                     const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, bool bSink) {
+  if (Reg.isPhysical())
+    return false;
+  bool bVGPR = SIRI->isVGPR(MRI, Reg);
+
+  MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+  if (!DefMI)
+    return false;
+  if (DefMI->getOpcode() == AMDGPU::PHI)
+    return false;
+
+  // Skip convergent.
+  if (isConvergent(Remat, *DefMI))
+    return false;
+
+  // Skip insts that have more than one def.
+  if (DefMI->getDesc().NumDefs > 1)
+    return false;
+
+  unsigned OpNum = DefMI->getNumOperands();
+
+  // Only move DefMI whose operands all have unique defs.
+  for (unsigned i = 0; i < OpNum; i++) {
+    MachineOperand &Op = DefMI->getOperand(i);
+    if (!Op.isReg())
+      continue;
+    Register OpReg = Op.getReg();
+    if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) ||
+        IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO))
+      continue;
+    if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI))
+      continue;
+    // Allow unused scc defines.
+    if (Op.isImplicit() && Op.isDead() && Op.isDef())
+      continue;
+    if (OpReg.isPhysical())
+      return false;
+    if (!MRI.getUniqueVRegDef(OpReg) && !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) {
+      return false;
+    }
+  }
+
+  if (bVGPR && bSink) {
+    // Skip mem related inst.
+    if (DefMI->mayLoadOrStore()) {
+      return false;
+    }
+
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      if (!IsSafeRematCandidateUser(&UseMI, SIII))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+std::vector<SubExp> buildSubExpFromCandidates(
+    Remat *Remat,
+    GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+    const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes,
+    GCNRPTracker::LiveRegSet &unUsedPassThrus,
+    bool bAllowPartialUseInSubExp) {
+  InstSet CandidateDefs;
+  DenseSet<unsigned> RemovedCandidates;
+  std::vector<unsigned> CandidateRegs;
+  CandidateRegs.reserve(Candidates.size());
+  for (auto it : Candidates) {
+    unsigned Reg = it.first;
+    CandidateRegs.emplace_back(Reg);
+  }
+  // Sort candidates by their defMI order so that each defMI gets its
+  // dependence check after all of the nodes that depend on it.
+  std::sort(CandidateRegs.begin(), CandidateRegs.end(),
+            [&MRI, &slotIndexes](const unsigned a, unsigned b) {
+              MachineInstr *MIa = MRI.getUniqueVRegDef(a);
+
+              MachineInstr *MIb = MRI.getUniqueVRegDef(b);
+              // Later instr first.
+              return !SlotIndex::isEarlierInstr(
+                  slotIndexes->getInstructionIndex(*MIa),
+                  slotIndexes->getInstructionIndex(*MIb));
+            });
+
+  // If a candidate def has a user in MBB, only add it when partial candidates
+  // are allowed. A subExp containing such a def can only be cloned, not moved
+  // across blocks, because of the user in MBB.
+  DenseSet<MachineInstr *> PartialCandidates;
+  LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";);
+  for (unsigned Reg : CandidateRegs) {
+    MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+    bool bHasNoCandidatesSameBlockUser = false;
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      if (UseMI.getParent() == MI->getParent()) {
+        if (UseMI.getNumExplicitDefs() == 1) {
+          // Skip users which are already in Candidates.
+          unsigned UserDefReg = UseMI.getOperand(0).getReg();
+          if (Candidates.count(UserDefReg) > 0 &&
+              RemovedCandidates.count(UserDefReg) == 0)
+            continue;
+        }
+        if (!bAllowPartialUseInSubExp)
+          bHasNoCandidatesSameBlockUser = true;
+        else
+          PartialCandidates.insert(MI);
+        break;
+      }
+    }
+    if (bHasNoCandidatesSameBlockUser) {
+      RemovedCandidates.insert(Reg);
+      continue;
+    }
+    LLVM_DEBUG(MI->dump());
+    CandidateDefs.insert(MI);
+  }
+  LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";);
+
+  if (CandidateDefs.empty())
+    return std::vector<SubExp>();
+  for (unsigned Reg : RemovedCandidates) {
+    unUsedPassThrus[Reg] = Candidates[Reg];
+    Candidates.erase(Reg);
+  }
+
+  // Iterate the MBB backward and add instructions that are only used by the
+  // candidate defs.
+  for (auto it = MBB->rbegin(); it != MBB->rend(); it++) {
+    MachineInstr &MI = *it;
+    if (CandidateDefs.count(&MI) > 0) {
+      continue;
+    }
+
+    if (isConvergent(Remat, MI))
+      continue;
+    // Skip if MI is not safe to move.
+    if (MI.getNumDefs() != 1) {
+      // Allow moving an unused implicit def.
+      bool bDeadImplicitDef = false;
+      for (MachineOperand &MO : MI.implicit_operands()) {
+        if (!MO.isReg())
+          continue;
+        if (!MO.isDef())
+          continue;
+        bDeadImplicitDef = MO.isDead();
+      }
+      if (!bDeadImplicitDef)
+        continue;
+    }
+
+    unsigned Reg = -1;
+    for (MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      if (!MO.isDef())
+        continue;
+      Reg = MO.getReg();
+      break;
+    }
+
+    if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/true))
+      continue;
+
+    // If all users of MI are in the candidate defs, add MI to the candidate
+    // defs. If only some users are, add MI only when partial use is allowed.
+    bool bAllUserInCandidate = true;
+    bool bHasCandidateUser = false;
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      if (CandidateDefs.count(&UseMI) == 0)
+        bAllUserInCandidate = false;
+      else
+        bHasCandidateUser = true;
+    }
+    if (!bHasCandidateUser)
+      continue;
+    if (!bAllUserInCandidate) {
+      if (!bAllowPartialUseInSubExp)
+        continue;
+      PartialCandidates.insert(&MI);
+    }
+
+    CandidateDefs.insert(&MI);
+  }
+
+  // Collect input for CandidateDefs.
+  GCNRPTracker::LiveRegSet CandidateInput;
+  for (MachineInstr *MI : CandidateDefs) {
+    for (MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef())
+        continue;
+      Register Reg = MO.getReg();
+      if (MO.isImplicit() && Reg.isPhysical())
+        continue;
+
+      MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+      assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) &&
+             "UseMI should be safe to move");
+      if (DefMI && CandidateDefs.count(DefMI) > 0)
+        continue;
+      // Add to input.
+      CandidateInput[Reg] |= llvm::getRegMask(MO, MRI);
+    }
+  }
+
+  // Build defs in order.
+  std::vector<MachineInstr *> defs;
+  defs.reserve(CandidateDefs.size());
+  for (MachineInstr &MI : *MBB) {
+    MachineInstr *pMI = &MI;
+    if (CandidateDefs.count(pMI) == 0)
+      continue;
+    defs.emplace_back(pMI);
+  }
+
+  LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI
+                                                            : defs) {
+    MI->dump();
+  } dbgs() << "\nFinished Candidate Defs End\n";);
+
+  // Build SubExp with CandidateDefs as Nodes, CandidateInput as input
+  // Candidates as output.
+  ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true);
+  dag.build(CandidateInput, Candidates, defs);
+  if (bAllowPartialUseInSubExp) {
+    for (auto &subExp : dag.SubExps) {
+      for (auto *MI : subExp.SUnits) {
+        if (PartialCandidates.count(MI)) {
+          subExp.bCloneOnly = true;
+          break;
+        }
+      }
+    }
+  }
+  return dag.SubExps;
+}
+
+
+std::vector<SubExp> buildSubExpFromCandidatesTopBottom(
+    Remat* Remat,
+    GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+    const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) {
+  InstSet CandidateDefs;
+
+  LLVM_DEBUG(dbgs() << "\nCandidate Defs:\n";);
+  for (auto it : Candidates) {
+    unsigned Reg = it.first;
+    MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      if (isConvergent(Remat, UseMI))
+        continue;
+      MachineBasicBlock *UseMBB = UseMI.getParent();
+      if (UseMBB == MI->getParent())
+        continue;
+      assert(UseMBB == MBB && "block mismatch");
+      // If all operands are in Candidates, add UseMI to CandidateDefs.
+      bool bHasOpRegNotInCandidates = false;
+      for (MachineOperand &MO : UseMI.operands()) {
+        if (!MO.isReg())
+          continue;
+        if (MO.isDef())
+          continue;
+        Register OpReg = MO.getReg();
+        if (MO.isImplicit() && OpReg.isPhysical())
+          continue;
+        if (Candidates.count(OpReg) == 0) {
+          bHasOpRegNotInCandidates = true;
+          break;
+        }
+      }
+      if (bHasOpRegNotInCandidates)
+        continue;
+
+      LLVM_DEBUG(UseMI.dump());
+      CandidateDefs.insert(&UseMI);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "\nCandidate Defs End\n";);
+
+  if (CandidateDefs.empty())
+    return std::vector<SubExp>();
+
+  // Iterate the MBB.
+  GCNRPTracker::LiveRegSet LocalCandidates = Candidates;
+  // Add instructions that are only used by the candidate defs.
+  for (auto it = MBB->begin(); it != MBB->end(); it++) {
+    MachineInstr &MI = *it;
+    if (CandidateDefs.count(&MI) > 0) {
+      for (MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg())
+          continue;
+        if (!MO.isDef())
+          continue;
+        Register Reg = MO.getReg();
+        if (Reg.isPhysical())
+          continue;
+        LocalCandidates[Reg];
+      }
+      continue;
+    }
+
+    // Skip if MI is not safe to move.
+    if (isConvergent(Remat, MI))
+      continue;
+
+    if (MI.getNumDefs() != 1)
+      continue;
+
+    if (MI.mayLoadOrStore()) {
+      continue;
+    }
+
+    unsigned Reg = -1;
+    for (MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      if (!MO.isDef())
+        continue;
+      Reg = MO.getReg();
+      break;
+    }
+
+    // Still use bsink to skip mem load/store.
+    // if (!isSafeCandidate(Reg, MRI, SIRI, SIII, /*bSink*/true))
+    //  continue;
+
+    // If all operands of MI are produced by the candidate defs or are in
+    // LocalCandidates, add MI to the candidate defs.
+    bool bAllOperandInCandidate = true;
+    for (MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef())
+        continue;
+      Register OpReg = MO.getReg();
+      if (LocalCandidates.count(OpReg))
+        continue;
+
+      if (MO.isImplicit() &&
+          (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO))
+        continue;
+      if (OpReg.isPhysical()) {
+        bAllOperandInCandidate = false;
+        break;
+      }
+      MachineInstr *OpMI = MRI.getUniqueVRegDef(OpReg);
+      if (!OpMI) {
+        bAllOperandInCandidate = false;
+        break;
+      }
+      if (CandidateDefs.count(OpMI) == 0) {
+        bAllOperandInCandidate = false;
+        break;
+      }
+      if (MO.isTied())
+        continue;
+    }
+    if (!bAllOperandInCandidate)
+      continue;
+    LLVM_DEBUG(llvm::dbgs() << "Add local candidates:";
+               pressure::print_reg(Reg, MRI, SIRI, llvm::dbgs()););
+    LocalCandidates[Reg];
+    CandidateDefs.insert(&MI);
+  }
+
+  // Collect input for CandidateDefs.
+  GCNRPTracker::LiveRegSet CandidateInput;
+  for (MachineInstr *MI : CandidateDefs) {
+    for (MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef())
+        continue;
+      Register Reg = MO.getReg();
+      if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO))
+        continue;
+      if (Reg.isPhysical())
+        continue;
+      MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+      if (!DefMI) {
+        // Skip local def which is not unique.
+        if (MO.isTied())
+          continue;
+        if (Candidates.count(Reg) == 0 && LocalCandidates.count(Reg) != 0)
+          continue;
+      }
+      assert((DefMI || llvm::IsSub0Sub1SingleDef(Reg, MRI)) &&
+             "UseMI should be safe to move");
+      if (DefMI && CandidateDefs.count(DefMI) > 0)
+        continue;
+      // Add to input.
+      CandidateInput[Reg] = llvm::getRegMask(MO, MRI);
+    }
+  }
+
+  // Build defs in order.
+  std::vector<MachineInstr *> defs;
+  defs.reserve(CandidateDefs.size());
+  for (MachineInstr &MI : *MBB) {
+    MachineInstr *pMI = &MI;
+    if (CandidateDefs.count(pMI) == 0)
+      continue;
+    defs.emplace_back(pMI);
+  }
+
+  LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI
+                                                            : defs) {
+    MI->dump();
+  } dbgs() << "\nFinished Candidate Defs End\n";);
+
+  LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it
+                                                            : LocalCandidates) {
+    pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs());
+  } dbgs() << "\nLocalCandidates End\n";);
+  // Make sure all input regs have unique defs.
+  // Build SubExps with CandidateDefs as nodes, Candidates as input and
+  // LocalCandidates as output.
+  ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true);
+  dag.build(Candidates, LocalCandidates, defs);
+  return dag.SubExps;
+}
+
+
+void print_vreg(Register Reg, const MachineRegisterInfo &MRI) {
+  if (Reg.isVirtual()) {
+    StringRef Name = MRI.getVRegName(Reg);
+    if (Name != "") {
+      dbgs() << '%' << Name;
+    } else {
+      dbgs() << '%' << Register::virtReg2Index(Reg);
+    }
+  }
+}
+
+MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB,
+                                   const MachineRegisterInfo &MRI,
+                                   MachineDominatorTree *pDT) {
+  BlockSet userBlocks;
+  for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+    MachineBasicBlock *UserBB = UseMI.getParent();
+    // Skip current BB.
+    if (UserBB != FromBB)
+      userBlocks.insert(UserBB);
+    else
+      // If there is a user in FromBB, the target block would be FromBB itself.
+      return nullptr;
+  }
+  if (userBlocks.empty())
+    return nullptr;
+  MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks);
+  if (!pDT->dominates(FromBB, userBlock)) {
+    return nullptr;
+  }
+  if (userBlock == FromBB)
+    return nullptr;
+  return userBlock;
+}
+
+void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI,
+                             MachineDominatorTree *pDT,
+                             SlotIndexes *slotIndexes,
+                             const SIInstrInfo *SIII,
+                             const SIRegisterInfo *SIRI) {
+  // Move from bottom.
+  MachineBasicBlock *FromBB = Exp.FromBB;
+  for (auto it = Exp.SUnits.rbegin(); it != Exp.SUnits.rend(); it++) {
+    MachineInstr *DefMI = *it;
+    if (DefMI->getNumExplicitDefs() != 1)
+      continue;
+
+    unsigned Reg = DefMI->getOperand(0).getReg();
+    MachineBasicBlock *ToBB = FindTargetBlock(Reg, FromBB, MRI, pDT);
+    if (!ToBB)
+      continue;
+
+    // Do not overwrite a live scc.
+    MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin());
+    if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint))
+      continue;
+
+    DefMI->removeFromParent();
+    assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && "invalid insert point");
+    ToBB->insert(InsertPoint, DefMI);
+    // Debug insts don't need slot index.
+    if (DefMI->isDebugInstr())
+      continue;
+    // Update slot index.
+    slotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+    slotIndexes->insertMachineInstrInMaps(*DefMI);
+  }
+}
+
+
+void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI,
+                             MachineDominatorTree *pDT,
+                             SlotIndexes *slotIndexes,
+                             const SIInstrInfo *SIII,
+                             const SIRegisterInfo *SIRI) {
+  // Move from top.
+  // Find lowest input def.
+  MachineBasicBlock *ToBB = Exp.ToBB;
+  assert(!ToBB->empty() && "ToBB must have instructions defining input nodes");
+  auto Terminator = ToBB->getFirstTerminator();
+  if (Terminator == ToBB->end() && ToBB->succ_size() == 1) {
+    MachineInstr &EndMI = *ToBB->rbegin();
+    if (SIII->isSchedulingBoundary(EndMI, ToBB, *ToBB->getParent()))
+      // Insert before the scheduling boundary instruction.
+      Terminator = EndMI.getIterator();
+    else
+      // No boundary so just insert inst at the end of the block.
+      Terminator = ToBB->end();
+  }
+
+  Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator,
+                                                         MRI, SIRI, SIII);
+
+  for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) {
+    MachineInstr *DefMI = *it;
+    if (DefMI->getNumExplicitDefs() != 1)
+      continue;
+    if (SIII->isEXP(DefMI->getOpcode()))
+      continue;
+    if (DefMI->mayStore())
+      continue;
+    // Find def for DefMI operands as insert point.
+    DefMI->removeFromParent();
+    ToBB->insert(Terminator, DefMI);
+
+    // Debug insts don't need slot index.
+    if (DefMI->isDebugInstr())
+      continue;
+    // Update slot index.
+    slotIndexes->removeSingleMachineInstrFromMaps(*DefMI);
+    slotIndexes->insertMachineInstrInMaps(*DefMI);
+  }
+}
+
+DenseSet<MachineInstr *> buildCloneSet(ExpDag &dag,
+                                       DenseSet<SUnit *> &dagBottoms,
+                                       GCNRPTracker::LiveRegSet &usedOutput) {
+  DenseSet<MachineInstr *> copySet;
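+  // Walk the dag bottom-up: an instruction is cloned if it defines a value in
+  // usedOutput, or if any of its successors is already in the clone set.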
+  for (auto it = dag.SUnits.rbegin(); it != dag.SUnits.rend(); it++) {
+    SUnit &SU = *it;
+    // Skip non-inst node.
+    if (!SU.isInstr())
+      continue;
+    MachineInstr *MI = SU.getInstr();
+    if (dagBottoms.find(&SU) != dagBottoms.end()) {
+      bool bUsed = false;
+      // For a bottom SU, add it to copySet if one of its defs is in
+      // usedOutput.
+      for (MachineOperand &DefMO : MI->defs()) {
+        if (!DefMO.isReg())
+          continue;
+        unsigned Reg = DefMO.getReg();
+        if (usedOutput.count(Reg) > 0) {
+          bUsed = true;
+          break;
+        }
+      }
+      if (bUsed) {
+        copySet.insert(MI);
+        continue;
+      }
+      // A bottom SU may still have successors when it is used both inside and
+      // outside the exp, so continue to check its successors.
+    }
+
+    // If any SuccNode is in copySet, add to copySet.
+    bool bSuccCopied = false;
+    for (SDep &SucDep : SU.Succs) {
+      SUnit *SucSU = SucDep.getSUnit();
+      MachineInstr *SuccMI = SucSU->getInstr();
+      if (copySet.count(SuccMI) > 0) {
+        bSuccCopied = true;
+        break;
+      }
+    }
+    if (bSuccCopied)
+      copySet.insert(MI);
+  }
+  return copySet;
+}
+
+void updateUsers(SmallVector<MachineInstr *, 2> &userMIs,
+                 DenseMap<unsigned, unsigned> &RegMap) {
+
+  for (MachineInstr *UserMI : userMIs) {
+    for (MachineOperand &MO : UserMI->uses()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      auto it = RegMap.find(Reg);
+      if (it == RegMap.end())
+        continue;
+      unsigned NewReg = it->second;
+      MO.setReg(NewReg);
+    }
+  }
+}
+
+struct HotBlock {
+  MachineBasicBlock *MBB = nullptr;
+  GCNRPTracker::LiveRegSet inputLive;
+  std::pair<unsigned, unsigned> maxPressures;
+  // Info about vmemLd.
+  int vmemLdInputSize = 0;
+  int vmemLdOutputSize = 0;
+};
+
+DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
+    SubExp &Exp,
+    MapVector<MachineBasicBlock *, SmallVector<MachineInstr *, 2>> &userBlocks,
+    DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> &userBlocksLiveRegs,
+    std::vector<HotBlock> &hotBlocks, MachineDominatorTree *pDT) {
+  // Collect hot blocks which Exp is live in.
+  DenseSet<MachineBasicBlock *> hotBlockSet;
+  for (HotBlock &hotBlock : hotBlocks) {
+    for (unsigned Reg : Exp.BottomRegs) {
+      if (hotBlock.inputLive.count(Reg)) {
+        hotBlockSet.insert(hotBlock.MBB);
+        break;
+      }
+    }
+  }
+
+  // User blocks which dominate all hot blocks do not need a clone, because the
+  // value no longer crosses the hot blocks once the later blocks are cloned.
+  // User blocks which are dominated by all hot blocks can share clones,
+  // because the pressure is fine once past the hot blocks.
+  DenseSet<MachineBasicBlock *> afterHotRangeMBBs;
+  for (auto it : userBlocksLiveRegs) {
+    MachineBasicBlock *MBB = it.first;
+    // Always clone in hot block.
+    if (hotBlockSet.count(MBB))
+      continue;
+
+    bool bDomAllHotBlocks = true;
+    bool bDomedByAllHotBlocks = true;
+    for (MachineBasicBlock *hotMBB : hotBlockSet) {
+      if (!pDT->dominates(MBB, hotMBB)) {
+        bDomAllHotBlocks = false;
+      }
+      if (!pDT->dominates(hotMBB, MBB)) {
+        bDomedByAllHotBlocks = false;
+      }
+      if (!bDomAllHotBlocks && !bDomedByAllHotBlocks) {
+        break;
+      }
+    }
+    if (bDomAllHotBlocks) {
+      userBlocks.erase(MBB);
+    } else if (bDomedByAllHotBlocks) {
+      afterHotRangeMBBs.insert(MBB);
+    }
+  }
+
+  // Partition the after-hot-range block set using the dominator tree.
+  DenseMap<MachineBasicBlock *, BlockSet> DomMap;
+  if (!afterHotRangeMBBs.empty()) {
+    for (auto it : afterHotRangeMBBs) {
+      MachineBasicBlock *MBB = it;
+      for (auto it2 : afterHotRangeMBBs) {
+        MachineBasicBlock *MBB2 = it2;
+        if (MBB == MBB2)
+          continue;
+        if (pDT->dominates(MBB, MBB2)) {
+          auto &Dom = DomMap[MBB];
+          Dom.insert(MBB2);
+          auto &Dom2 = DomMap[MBB2];
+          Dom.insert(Dom2.begin(), Dom2.end());
+        }
+      }
+    }
+    for (auto it : afterHotRangeMBBs) {
+      MachineBasicBlock *MBB = it;
+      auto &usedOutput = userBlocksLiveRegs[MBB];
+      auto &Dom = DomMap[MBB];
+      for (MachineBasicBlock *domedMBB : Dom) {
+        // Merge the dominated block's uses into MBB's uses.
+        mergeLiveRegSet(usedOutput, userBlocksLiveRegs[domedMBB]);
+        // Remove domedMBB.
+        DomMap.erase(domedMBB);
+        userBlocksLiveRegs.erase(domedMBB);
+      }
+    }
+  }
+
+  return DomMap;
+}
+
+void ApplySubExpCloneNearUser(SubExp &Exp, std::vector<HotBlock> &hotBlocks,
+                              MachineDominatorTree *pDT,
+                              MachineRegisterInfo &MRI,
+                              SlotIndexes *slotIndexes, const SIInstrInfo *SIII,
+                              const SIRegisterInfo *SIRI) {
+  MapVector<MachineBasicBlock *, SmallVector<MachineInstr *, 2>> userBlocks;
+  DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> userBlocksLiveRegs;
+  for (unsigned Reg : Exp.BottomRegs) {
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      MachineBasicBlock *UserBB = UseMI.getParent();
+      // Skip current BB.
+      if (UserBB == Exp.FromBB)
+        continue;
+
+      userBlocks[UserBB].emplace_back(&UseMI);
+      auto &userLives = userBlocksLiveRegs[UserBB];
+      for (MachineOperand &MO : UseMI.uses()) {
+        if (!MO.isReg())
+          continue;
+        unsigned UseReg = MO.getReg();
+        if (Reg != UseReg)
+          continue;
+        userLives[Reg] |= getRegMask(MO, MRI);
+      }
+    }
+  }
+  // Build a dag for the SubExp to help remove unused insts when cloning.
+  ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ true);
+  dag.build(Exp.inputLive, Exp.outputLive, Exp.SUnits);
+  DenseSet<SUnit *> dagBottoms;
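+  // Bottom nodes are the dag leaves plus any instruction that defines one of
+  // the exp's bottom regs; buildCloneSet prunes the clones against this set.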
+  for (SUnit &SU : dag.SUnits) {
+    if (!SU.isInstr())
+      continue;
+    if (SU.NumSuccs == 0) {
+      dagBottoms.insert(&SU);
+    } else {
+      MachineInstr *MI = SU.getInstr();
+      // Add SUs which define a value in Exp.BottomRegs.
+      for (MachineOperand &DefMO : MI->defs()) {
+        if (!DefMO.isReg())
+          continue;
+        unsigned Reg = DefMO.getReg();
+        if (Exp.BottomRegs.count(Reg) > 0) {
+          dagBottoms.insert(&SU);
+          break;
+        }
+      }
+    }
+  }
+
+  // User blocks which dominate all hot blocks do not need a clone, because the
+  // value no longer crosses the hot blocks once the later blocks are cloned.
+  // User blocks which are dominated by all hot blocks can share clones,
+  // because the pressure is fine once past the hot blocks.
+  DenseMap<MachineBasicBlock *, BlockSet> DomMap =
+      reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT);
+
+  // Sort to make stable order.
+  std::sort(userBlocks.begin(), userBlocks.end(),
+    [](std::pair<MachineBasicBlock*, SmallVector<MachineInstr*, 2>>& it0,
+      std::pair<MachineBasicBlock*, SmallVector<MachineInstr*, 2>>& it1) {
+        return it0.first->getNumber() < it1.first->getNumber();
+    });
+
+  const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI);
+
+  // Clone into each user block. Do not share clones through the dom tree,
+  // since that would not help reg pressure.
+  for (auto it : userBlocks) {
+    MachineBasicBlock *MBB = it.first;
+    // Skip MBB which share clone from other MBBs.
+    if (userBlocksLiveRegs.count(MBB) == 0)
+      continue;
+    auto &usedOutput = userBlocksLiveRegs[MBB];
+    auto copySet = buildCloneSet(dag, dagBottoms, usedOutput);
+    // Clone to MBB.
+    // Create new regs first.
+    DenseMap<unsigned, unsigned> RegMap;
+    auto insertPtr = MBB->getFirstNonPHI();
+    // If Exp reads/writes scc, make sure scc is not live at the insert point
+    // in MBB.
+    if (bModifiesScc && llvm::IsSccLiveAt(MBB, insertPtr))
+      continue;
+    MachineFunction *MF = MBB->getParent();
+    for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) {
+      MachineInstr *DefMI = *it;
+      // Not clone if already in MBB.
+      if (DefMI->getParent() == MBB)
+        continue;
+      // Not clone if not used for MBB.
+      if (copySet.count(DefMI) == 0)
+        continue;
+
+      auto ClonedMI =
+          BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc());
+
+      for (MachineOperand &Def : DefMI->defs()) {
+        Register Reg = Def.getReg();
+        if (Reg.isPhysical()) {
+          if (Def.isImplicit())
+            continue;
+          ClonedMI.addDef(Reg, 0, Def.getSubReg());
+        } else {
+          unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+          RegMap[Reg] = NewReg;
+          ClonedMI.addDef(NewReg, 0, Def.getSubReg());
+        }
+      }
+
+      for (MachineOperand &MO : DefMI->uses()) {
+        if (MO.isReg()) {
+          Register Reg = MO.getReg();
+          if (Reg.isPhysical()) {
+            if (MO.isImplicit())
+              continue;
+            ClonedMI.addReg(Reg, 0, MO.getSubReg());
+          } else {
+            auto it = RegMap.find(Reg);
+            if (it == RegMap.end()) {
+              ClonedMI.addReg(Reg, 0, MO.getSubReg());
+            } else {
+              ClonedMI.addReg(it->second, 0, MO.getSubReg());
+            }
+          }
+        } else {
+          ClonedMI.add(MO);
+        }
+      }
+
+      MachineInstr *NewDef = ClonedMI.getInstr();
+      slotIndexes->insertMachineInstrInMaps(*NewDef);
+      // Set mem operand
+      for (MachineMemOperand *MO : DefMI->memoperands()) {
+        NewDef->addMemOperand(*MF, MO);
+      }
+    }
+
+    // update users in MBB.
+    SmallVector<MachineInstr *, 2> &userMIs = it.second;
+    updateUsers(userMIs, RegMap);
+
+    // update users in dom MBBs.
+    auto domMapIt = DomMap.find(MBB);
+    if (domMapIt != DomMap.end()) {
+      for (MachineBasicBlock *UpdateMBB : domMapIt->second) {
+        SmallVector<MachineInstr *, 2> &userMIs = userBlocks[UpdateMBB];
+        updateUsers(userMIs, RegMap);
+      }
+    }
+  }
+}
+
+
+void ApplySubExpCloneNearUserInBlock(
+    SubExp &Exp,
+    DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
+    DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotSInstMap,
+    MachineRegisterInfo &MRI, SlotIndexes *slotIndexes, const SIInstrInfo *SIII,
+    const SIRegisterInfo *SIRI) {
+  MachineBasicBlock *MBB = Exp.FromBB;
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *hotVMI = inBlockHotVInstMap[MBB];
+  MachineInstr *hotSMI = inBlockHotSInstMap[MBB];
+  // Exp is built from either hotVMI or hotSMI; they cannot be mixed.
+  assert(!(hotVMI && hotSMI) && "cannot mix hot MI");
+  MachineInstr *hotMI = hotVMI;
+  if (!hotMI) {
+    hotMI = hotSMI;
+  }
+
+  SlotIndex hotSlot = slotIndexes->getInstructionIndex(*hotMI).getBaseIndex();
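+  // hotSlot is where the in-block pressure peaks; only uses at or after this
+  // point need the cloned values.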
+  const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI);
+
+  for (unsigned Reg : Exp.BottomRegs) {
+
+    SmallVector<MachineInstr *, 2> useMIs;
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      MachineBasicBlock *UserBB = UseMI.getParent();
+      // Only consider users inside the current BB.
+      if (UserBB != Exp.FromBB)
+        continue;
+      // Skip inst in Exp.
+      if (Exp.BottomRoots.find(&UseMI) != Exp.BottomRoots.end())
+        continue;
+      SlotIndex useSlot =
+          slotIndexes->getInstructionIndex(UseMI).getBaseIndex();
+      // Only clone for use after hot slot.
+      if (useSlot < hotSlot)
+        continue;
+
+      // Do not overwrite a live scc.
+      if (bModifiesScc && llvm::IsSccLiveAt(UserBB, &UseMI))
+        continue;
+
+      useMIs.emplace_back(&UseMI);
+    }
+    if (useMIs.empty())
+      continue;
+    DenseMap<unsigned, unsigned> RegMap;
+
+    std::sort(useMIs.begin(), useMIs.end(),
+              [&slotIndexes](const MachineInstr *MIa, const MachineInstr *MIb) {
+                return slotIndexes->getInstructionIndex(*MIa).getBaseIndex() <
+                       slotIndexes->getInstructionIndex(*MIb).getBaseIndex();
+              });
+    auto insertPtr = useMIs.front()->getIterator();
+
+    for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) {
+      MachineInstr *DefMI = *it;
+      auto ClonedMI =
+          BuildMI(*MBB, insertPtr, DefMI->getDebugLoc(), DefMI->getDesc());
+
+      for (MachineOperand &Def : DefMI->defs()) {
+        Register Reg = Def.getReg();
+        if (Reg.isPhysical()) {
+          ClonedMI.addDef(Reg, 0, Def.getSubReg());
+        } else {
+          unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+          RegMap[Reg] = NewReg;
+          ClonedMI.addDef(NewReg, 0, Def.getSubReg());
+        }
+      }
+
+      for (MachineOperand &MO : DefMI->uses()) {
+        if (MO.isReg()) {
+          if (MO.isImplicit()) {
+            continue;
+          }
+          Register Reg = MO.getReg();
+          if (Reg.isPhysical()) {
+            ClonedMI.addReg(Reg, 0, MO.getSubReg());
+          } else {
+            auto it = RegMap.find(Reg);
+            if (it == RegMap.end()) {
+              ClonedMI.addReg(Reg, 0, MO.getSubReg());
+            } else {
+              ClonedMI.addReg(it->second, 0, MO.getSubReg());
+            }
+          }
+        } else {
+          ClonedMI.add(MO);
+        }
+      }
+
+      MachineInstr *NewDef = ClonedMI.getInstr();
+      slotIndexes->insertMachineInstrInMaps(*NewDef);
+      // Set mem operand
+      for (MachineMemOperand *MO : DefMI->memoperands()) {
+        NewDef->addMemOperand(*MF, MO);
+      }
+    }
+    // TODO: only clone for uses that cross the hot range.
+    for (MachineInstr *UseMI : useMIs) {
+      for (MachineOperand &MO : UseMI->uses()) {
+        if (!MO.isReg())
+          continue;
+        unsigned Reg = MO.getReg();
+        auto it = RegMap.find(Reg);
+        if (it == RegMap.end())
+          continue;
+        unsigned NewReg = it->second;
+        MO.setReg(NewReg);
+      }
+    }
+  }
+}
+
+bool isInLiveSet(unsigned Reg, LaneBitmask mask,
+                 const GCNRPTracker::LiveRegSet &live) {
+  auto it = live.find(Reg);
+  if (it == live.end())
+    return false;
+
+  LaneBitmask liveMask = it->second;
+  return (liveMask | mask) == liveMask;
+}
+
+unsigned getPacifistLevel(unsigned Reg,
+                          DenseMap<MachineInstr *, unsigned> &pacifistLevels,
+                          const MachineRegisterInfo &MRI) {
+  unsigned level = 0;
+  for (MachineInstr &MI : MRI.def_instructions(Reg)) {
+    auto it = pacifistLevels.find(&MI);
+    if (it == pacifistLevels.end())
+      continue;
+    level = it->second;
+  }
+  return level;
+}
+
+bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB,
+                                  const MachineRegisterInfo &MRI) {
+  for (MachineInstr &def : MRI.def_instructions(Reg)) {
+    if (def.getParent() != MBB)
+      continue;
+    return true;
+  }
+  return false;
+}
+
+MachineInstr *getInBlockUniqueDef(unsigned Reg, MachineBasicBlock *MBB,
+                                  const GCNRPTracker::LiveRegSet &inputLive,
+                                  const GCNRPTracker::LiveRegSet &outputLive,
+                                  const MachineRegisterInfo &MRI) {
+  MachineInstr *DefMI = nullptr;
+  // If the reg is live into MBB, it cannot have a unique in-block def.
+  if (inputLive.count(Reg))
+    return DefMI;
+  for (MachineInstr &def : MRI.def_instructions(Reg)) {
+    if (def.getParent() != MBB)
+      continue;
+    if (DefMI) {
+      // Not unique.
+      DefMI = nullptr;
+      break;
+    }
+    DefMI = &def;
+  }
+  return DefMI;
+}
+
+bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive,
+                const GCNRPTracker::LiveRegSet &outputLive) {
+  return inputLive.count(Reg) && outputLive.count(Reg);
+}
+
+// Instructions which only use immediates, pass-through regs, or output-only
+// regs will not kill any live reg, so call them pacifists here.
+bool collectPacifist(MachineInstr &MI,
+                     const GCNRPTracker::LiveRegSet &inputLive,
+                     const GCNRPTracker::LiveRegSet &outputLive,
+                     const MachineRegisterInfo &MRI,
+                     const SIRegisterInfo *SIRI) {
+  // If it has an implicit def, do not move it.
+  if (MI.getDesc().NumImplicitDefs != 0)
+    return false;
+
+  for (MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef())
+      continue;
+
+    Register Reg = MO.getReg();
+    if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO))
+      continue;
+    if (Reg.isPhysical())
+      return false;
+    // The reg must either have a unique def in the block, or be a pass-through
+    // with no def in the block. Otherwise it is not safe to move.
+    if (!(nullptr != getInBlockUniqueDef(Reg, MI.getParent(), inputLive,
+                                         outputLive, MRI) ||
+          (isPassThru(Reg, inputLive, outputLive) &&
+           !hasInBlockDef(Reg, MI.getParent(), MRI))))
+      return false;
+
+    LaneBitmask mask = llvm::getRegMask(MO, MRI);
+
+    if (isInLiveSet(Reg, mask, outputLive))
+      continue;
+
+    return false;
+  }
+  bool bHasDef = false;
+  for (MachineOperand &MO : MI.defs()) {
+    Register Reg = MO.getReg();
+
+    if (Reg.isPhysical())
+      return false;
+
+    if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), inputLive,
+                                       outputLive, MRI))
+      return false;
+
+    bHasDef = true;
+  }
+  // If no def, it will not increase pressure, don't mark it.
+  return bHasDef;
+}
+
+static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI,
+                                                       MachineBasicBlock &MBB,
+                                                       AliasAnalysis *AA) {
+  if (MI.mayLoadOrStore()) {
+    for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end();
+         I != E; ++I) {
+      const bool UseTBAA = false;
+      if (MI.mayAlias(AA, *I, UseTBAA))
+        return &*I;
+    }
+  }
+
+  return nullptr;
+}
+
+static MachineInstr *findPacifistInsertPoint(MachineInstr &MI,
+                                             MachineBasicBlock &MBB,
+                                             MachineRegisterInfo &MRI,
+                                             AliasAnalysis *AA,
+                                             SlotIndexes *slotIndexes) {
+
+  SmallVector<MachineInstr *, 2> users;
+
+  // We cannot move the pacifist instruction past any memory
+  // op with which it aliases. Find the first instruction
+  // that aliases the pacifist MI (if any) and add it to the list
+  // of users. The sort() below will select the earliest user instruction.
+  if (MachineInstr* AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) {
+    users.push_back(AliasMI);
+  }
+
+  for (MachineOperand &MO : MI.defs()) {
+    unsigned Reg = MO.getReg();
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      if (&MBB != UseMI.getParent())
+        continue;
+      users.emplace_back(&UseMI);
+    }
+  }
+  if (users.empty())
+    return nullptr;
+
+  std::sort(users.begin(), users.end(),
+            [&slotIndexes](const MachineInstr *MIa, MachineInstr *MIb) {
+              // Early instr first.
+              return SlotIndex::isEarlierInstr(
+                  slotIndexes->getInstructionIndex(*MIa),
+                  slotIndexes->getInstructionIndex(*MIb));
+            });
+  return users.front();
+}
+
+// Pacifist insts only add pressure since they don't kill anything.
+// Try to sink them as late as possible in the MBB to help pressure.
+bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS,
+                     MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                     const SIInstrInfo *SIII, AliasAnalysis *AA,
+                     RematStatus &status) {
+  const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB];
+  const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
+
+  SmallVector<MachineInstr *, 32> pacifistList;
+  LLVM_DEBUG(dbgs() << "pacifist begin\n");
+  for (MachineInstr &MI : MBB) {
+    if (MI.isDebugInstr())
+      continue;
+    if (collectPacifist(MI, inputLive, outputLive, MRI, SIRI)) {
+      pacifistList.emplace_back(&MI);
+      LLVM_DEBUG(MI.dump());
+    }
+  }
+  LLVM_DEBUG(dbgs() << "pacifist end\n");
+
+  SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+  bool bUpdated = false;
+
+  // Move pacifist to its first user.
+  for (MachineInstr *MI : pacifistList) {
+    MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes);
+    if (firstUser == MI)
+      continue;
+    if (firstUser == MI->getNextNode())
+      continue;
+
+    auto insertPoint = MBB.getFirstInstrTerminator();
+    if (firstUser) {
+      insertPoint = firstUser->getIterator();
+    } else {
+      // Either the block has no terminator, or a branch may have an exec
+      // update before it; in both cases step back one instruction.
+      insertPoint--;
+
+      insertPoint =
+          llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
+
+      while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) ||
+              insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) &&
+             insertPoint != MI->getIterator()) {
+        insertPoint--;
+        insertPoint = llvm::skipDebugInstructionsBackward(insertPoint,
+                                                          MBB.instr_begin());
+      }
+      if (insertPoint == MI->getIterator())
+        continue;
+    }
+    // Do not overwrite a live scc.
+    if (WillSmashSccAtLocation(MI, &MBB, insertPoint))
+      continue;
+    MI->removeFromParent();
+    MBB.insert(insertPoint, MI);
+
+    LIS->handleMove(*MI);
+    bUpdated = true;
+  }
+
+  return bUpdated;
+}
+
+DenseMap<unsigned, MachineInstr *>
+collectUniformVgprs(Remat *Remat, MachineFunction &MF, MachineRegisterInfo &MRI,
+                    const SIRegisterInfo *SIRI) {
+  DenseMap<unsigned, MachineInstr *> UniformMap;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.isDebugInstr())
+        continue;
+      if (!Remat->TotalUniformInsts.count(&MI))
+        continue;
+      if (MI.getNumDefs() != 1)
+        continue;
+      int dstIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst);
+      if (dstIdx == -1)
+        continue;
+      MachineOperand &DstMO = MI.getOperand(dstIdx);
+      if (DstMO.getSubReg() != 0)
+        continue;
+      if (DstMO.isTied())
+        continue;
+      unsigned Reg = DstMO.getReg();
+      if (MRI.getUniqueVRegDef(Reg) == nullptr)
+        continue;
+
+      auto *VRC = SIRI->getRegClassForReg(MRI, Reg);
+      if (SIRI->isSGPRClass(VRC))
+        continue;
+      // TODO: Support more reg class.
+      if (VRC != &AMDGPU::VGPR_32RegClass)
+        continue;
+
+      UniformMap[Reg] = &MI;
+    }
+  }
+  return UniformMap;
+}
+
+// Try to insert readfirstlane on a uniform vgpr to turn it into an sgpr and
+// save vgpr pressure.
+bool collectVToSCrossHotSpot(
+    MachineBasicBlock &MBB, RematStatus &status,
+    DenseMap<unsigned, MachineInstr *> &UniformMap,
+    SmallMapVector<unsigned, MachineInstr *, 4> &VToSMap, LiveIntervals *LIS,
+    MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+    const SIInstrInfo *SIII) {
+  unsigned VLimit = status.TargetVLimit;
+  unsigned SLimit = status.TargetSLimit;
+  auto& ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+
+  GCNDownwardRPTracker Tracker(*LIS);
+
+  bool bUpdated = false;
+  const auto inputLive = status.MBBInputLiveMap[&MBB];
+  Tracker.reset(*MBB.begin(), &inputLive);
+  for (MachineInstr &MI : MBB) {
+    if (MI.isDebugInstr()) {
+      continue;
+    }
+
+    unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts());
+    unsigned SPressure = Tracker.getPressure().getMaxSGPR();
+
+    SPressure += RegForVCC;
+
+    Tracker.advance();
+    // Sgpr bound, vtos cannot help.
+    if (SPressure > SLimit)
+      return false;
+
+    if (VPressure <= VLimit) {
+      continue;
+    }
+
+    // Try to apply all possible v-to-s conversions to reduce VGPR pressure.
+    int VExtra = VPressure - VLimit;
+
+    const GCNRPTracker::LiveRegSet &CurLives = Tracker.getLiveRegs();
+    for (auto it : CurLives) {
+      unsigned Reg = it.first;
+      auto UniformIt = UniformMap.find(Reg);
+      if (UniformIt == UniformMap.end())
+        continue;
+      VToSMap[UniformIt->first] = UniformIt->second;
+      VExtra--;
+      bUpdated = true;
+    }
+
+  }
+  return bUpdated;
+}
+
+// Return true if the user is outside of the def's loop.
+static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User,
+                           MachineLoopInfo *MLI) {
+  MachineLoop *L = MLI->getLoopFor(Def->getParent());
+  return L && !L->contains(User->getParent());
+}
+
+bool rematUniformVgprToSgpr(
+    Remat *Remat,
+    MachineFunction &MF, RematStatus &status,
+    DenseMap<MachineBasicBlock *, GCNRegPressure> &MBBPressureMap,
+    std::vector<HotBlock> &hotBlocks, LiveIntervals *LIS, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineLoopInfo *MLI) {
+  DenseMap<unsigned, MachineInstr *> UniformVgprMap =
+      collectUniformVgprs(Remat, MF, MRI, SIRI);
+
+  SmallMapVector<unsigned, MachineInstr *, 4> VToSMap;
+
+  for (auto &hotBlock : hotBlocks) {
+    MachineBasicBlock &MBB = *hotBlock.MBB;
+    collectVToSCrossHotSpot(MBB, status, UniformVgprMap, VToSMap, LIS, MRI,
+                            SIRI, SIII);
+  }
+
+  if (VToSMap.empty())
+    return false;
+  SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+  const MCInstrDesc &ReadFirstLaneDesc = SIII->get(AMDGPU::V_READFIRSTLANE_B32);
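+  // For each uniform VGPR, insert a V_READFIRSTLANE_B32 right after its def
+  // and rewrite eligible users to the new SGPR, legalizing operands when the
+  // SGPR form is not directly accepted.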
+  for (auto it : VToSMap) {
+    unsigned Reg = it.first;
+    MachineInstr *MI = it.second;
+
+    auto *VRC = SIRI->getRegClassForReg(MRI, Reg);
+    // TODO: support bigger vgpr to sgpr.
+    if (VRC != &AMDGPU::VGPR_32RegClass)
+      continue;
+    auto *NewRC = SIRI->getEquivalentSGPRClass(VRC);
+    unsigned newDst = MRI.createVirtualRegister(NewRC);
+
+    auto ReadFirstLane =
+        BuildMI(MF, MI->getDebugLoc(), ReadFirstLaneDesc, newDst);
+    SmallVector<MachineInstr *, 2> userMIs;
+    for (MachineInstr &userMI : MRI.use_nodbg_instructions(Reg)) {
+      // Do not replace v->s across loops. Even if the value is uniform
+      // branch divergence can cause a uniform value in a loop to be
+      // non-uniform when used outside a loop.
+      if (IsSafeRematCandidateUser(&userMI, SIII) && !IsCrossLoopUse(MI, &userMI, MLI))
+        userMIs.emplace_back(&userMI);
+    }
+
+    // Finish readfirstlane
+    ReadFirstLane.addReg(Reg);
+    MachineInstr *VToSMI = ReadFirstLane.getInstr();
+    Remat->TotalUniformInsts.insert(VToSMI);
+    Remat->SafeToRemoveInsts.insert(VToSMI);
+    MachineBasicBlock *MBB = MI->getParent();
+    MBB->insertAfter(MI->getIterator(), VToSMI);
+    slotIndexes->insertMachineInstrInMaps(*VToSMI);
+
+    for (MachineInstr *userMI : userMIs) {
+      const auto &Desc = userMI->getDesc();
+      bool bIllegal = false;
+      for (unsigned i = 0; i < userMI->getNumOperands(); i++) {
+        MachineOperand &MO = userMI->getOperand(i);
+        if (!MO.isReg())
+          continue;
+        if (MO.isDef())
+          continue;
+        if (MO.getReg() != Reg)
+          continue;
+        if (i >= Desc.getNumOperands()) {
+          bIllegal = true;
+          break;
+        }
+
+        MO.setReg(newDst);
+        if (userMI->getDesc().operands()[i].RegClass != -1) {
+          if (!SIII->isOperandLegal(*userMI, i, &MO)) {
+            SIII->legalizeOperands(*userMI);
+            // In case legalizeOperands does not help, legalize with a mov.
+            if (userMI->getDesc().operands()[i].RegClass != -1 &&
+                !SIII->isOperandLegal(*userMI, i)) {
+              SIII->legalizeOpWithMove(*userMI, i);
+            }
+          }
+        } else {
+          // Assume there is no restriction on the reg class.
+        }
+      }
+      if (bIllegal)
+        continue;
+
+      auto rit = userMI->getReverseIterator();
+      rit++;
+      auto endIt = userMI->getParent()->rend();
+      while (rit != endIt && !rit->isDebugInstr() && !slotIndexes->hasIndex(*rit))
+        slotIndexes->insertMachineInstrInMaps(*(rit++));
+    }
+  }
+
+  return true;
+}
+
+bool collectRematableHotReg(
+    MachineInstr &MI, const GCNRPTracker::LiveRegSet &hotLive,
+    GCNRPTracker::LiveRegSet &pureHotRematSet,
+    DenseMap<MachineInstr *, unsigned> &pureHotRematLevels, unsigned &DefReg,
+    const GCNRPTracker::LiveRegSet &inputLive,
+    const GCNRPTracker::LiveRegSet &outputLive, const MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI) {
+  // Ignore insts that have no def or more than one def.
+  if (MI.getDesc().getNumDefs() != 1)
+    return false;
+
+  DefReg = MI.defs().begin()->getReg();
+
+  unsigned level = 0;
+  for (MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef())
+      continue;
+
+    Register Reg = MO.getReg();
+
+    // If the use is in the same MI, as in
+    //  %4:vgpr_32 = V_MAD_LEGACY_F32 %2:vgpr_32, %3:vgpr_32, %4:vgpr_32
+    // rematerializing it will not help.
+    if (Reg == DefReg) {
+      return false;
+    }
+
+    if (MO.isImplicit() && (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO))
+      continue;
+    if (Reg.isPhysical())
+      return false;
+
+    if (nullptr ==
+        getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
+      return false;
+
+    LaneBitmask mask = llvm::getRegMask(MO, MRI);
+
+    if (isInLiveSet(Reg, mask, hotLive))
+      continue;
+
+    if (isInLiveSet(Reg, mask, pureHotRematSet)) {
+      unsigned regLevel = getPacifistLevel(Reg, pureHotRematLevels, MRI);
+      level = std::max(level, regLevel);
+      continue;
+    }
+
+    return false;
+  }
+
+  for (MachineOperand &MO : MI.defs()) {
+    Register Reg = MO.getReg();
+
+    if (Reg.isPhysical())
+      return false;
+
+    if (nullptr ==
+        getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
+      return false;
+
+    LaneBitmask mask = llvm::getRegMask(MO, MRI);
+    pureHotRematSet[Reg] |= mask;
+  }
+
+  pureHotRematLevels[&MI] = level + 1;
+  return true;
+}
+
+bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI,
+              std::vector<SubExp> &inBlockCloneSubExps, bool bVGPR,
+              const GCNRPTracker::LiveRegSet &inputLive,
+              const GCNRPTracker::LiveRegSet &outputLive,
+              DenseSet<MachineInstr *> &hotSet, int vDistance, int sDistance,
+              unsigned VLimit, unsigned SLimit,
+              const DenseSet<MachineBasicBlock *> &MemWriteMBBSet,
+              LiveIntervals *LIS,
+              const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+              const SIInstrInfo *SIII) {
+  auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+  const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex();
+  const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI);
+
+  GCNRPTracker::LiveRegSet hotLive = LISLR;
+
+  GCNRPTracker::LiveRegSet pureHotRematSet;
+  std::vector<MachineInstr *> pureHotRematList;
+  DenseMap<MachineInstr *, unsigned> pureHotRematLevels;
+
+  GCNRPTracker::LiveRegSet outputSet;
+  LLVM_DEBUG(dbgs() << "pure hot remat begin\n");
+  // Find regs which could be rematerialized from other regs in the live set.
+  const unsigned kMaxRematLevel = 6;
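+  // The level value roughly tracks the depth of the remat dependency chain;
+  // capping it keeps the cloned expressions short.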
+  GCNDownwardRPTracker Tracker(*LIS);
+  Tracker.reset(*MBB.begin(), &inputLive);
+  for (auto it = MBB.begin(); it != MBB.end(); it++) {
+    MachineInstr &MI = *it;
+    const GCNRegPressure &RP = Tracker.getPressure();
+
+    if (MI.isDebugInstr())
+      continue;
+
+    // Ignore insts in the hot range.
+    if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit ||
+        RP.getMaxSGPR() > SLimit) {
+      Tracker.advance();
+      continue;
+    }
+
+    // Stop at hotMI.
+    if (&MI == hotMI)
+      break;
+
+    Tracker.advance();
+
+    unsigned DefReg = 0;
+    if (collectRematableHotReg(MI, hotLive, pureHotRematSet, pureHotRematLevels,
+                               DefReg, inputLive, outputLive, MRI, SIRI)) {
+      unsigned level = pureHotRematLevels[&MI];
+      if (level >= kMaxRematLevel)
+        continue;
+
+      // If the def reg is in the hot live set, add it to the output.
+      if (hotLive.find(DefReg) != hotLive.end()) {
+        bool bUserIsHot = false;
+        for (MachineInstr &UseMI : MRI.use_nodbg_instructions(DefReg)) {
+          if (UseMI.getParent() != &MBB)
+            continue;
+          if (0 == hotSet.count(&UseMI))
+            continue;
+
+          const auto &useSI = LIS->getInstructionIndex(UseMI).getBaseIndex();
+          // When there is a hot user after hotMI, remat may not help.
+          if (useSI > SI) {
+            bUserIsHot = true;
+            break;
+          }
+        }
+
+        if (bUserIsHot)
+          continue;
+        outputSet[DefReg];
+        LLVM_DEBUG(dbgs() << "hotRemat:");
+        LLVM_DEBUG(MI.getOperand(0).dump());
+        // Remove it from hotLive so it is not treated as an input when
+        // building the dag.
+        hotLive.erase(DefReg);
+      }
+      pureHotRematList.emplace_back(&MI);
+      LLVM_DEBUG(dbgs() << "level:" << level);
+      LLVM_DEBUG(MI.dump());
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "pure hot remat end\n");
+
+  // Create input/output for the pure hot remat.
+  // Build SubExps with pureHotRematList as nodes, hotLive as input and
+  // outputSet as output.
+  // Do not join inputs when building the ExpDag, to get small subExps.
+  ExpDag dag(MRI, SIRI, SIII, /*bJoinInput*/ false);
+  dag.build(hotLive, outputSet, pureHotRematList);
+  // Find the best subExps to add to inBlockCloneSubExps.
+  // Sort by subExp size so smaller subExps are considered first.
+  std::sort(dag.SubExps.begin(), dag.SubExps.end(),
+            [](const SubExp &a, const SubExp &b) {
+              return a.SUnits.size() < b.SUnits.size();
+            });
+  std::vector<SubExp> cloneSubExps;
+  int distance = bVGPR ? vDistance : sDistance;
+  for (SubExp &subExp : dag.SubExps) {
+    if (subExp.bNotSafeToCopy)
+      continue;
+    if (bVGPR) {
+      if (subExp.vOutputSize == 0)
+        continue;
+    } else {
+      if (subExp.sOutputSize == 0)
+        continue;
+    }
+    if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false))
+      continue;
+    // Do not clone big subExps.
+    if (subExp.SUnits.size() > 10)
+      continue;
+    // Do not allow remat in the block when the expression has a memory op and
+    // the block has a write. We could allow this in some cases with better
+    // analysis.
+    if (subExp.bHasMemInst && MemWriteMBBSet.count(&MBB))
+      continue;
+    if (bVGPR) {
+      distance -= subExp.vOutputSize;
+    } else {
+      distance -= subExp.sOutputSize;
+    }
+    cloneSubExps.emplace_back(subExp);
+    if (distance <= 0)
+      break;
+  }
+  if (distance <= 0) {
+    inBlockCloneSubExps.insert(inBlockCloneSubExps.end(), cloneSubExps.begin(),
+                               cloneSubExps.end());
+  }
+  return distance <= 0;
+}
+
+// Try to remat live regs in the hot spot from other live regs in the hot
+// spot.
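+// vDistance/sDistance are the remaining number of VGPRs/SGPRs above the
+// limits; vSaved/sSaved are the savings already achieved by earlier
+// candidates and are subtracted from the tracked pressure.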
+bool tryRematInHotSpot(
+    MachineBasicBlock &MBB, RematStatus &status, int vDistance, int sDistance,
+    int vSaved, int sSaved, std::vector<SubExp> &inBlockCloneSubExps,
+    DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
+    DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotSInstMap,
+    LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  unsigned VLimit = status.TargetVLimit;
+  unsigned SLimit = status.TargetSLimit;
+
+  auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+  const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB];
+
+  const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
+
+  // Collect reg pressure.
+  unsigned maxLocalVPressure = 0;
+  unsigned maxLocalSPressure = 0;
+  // Build a DAG or only on demand?
+  MachineInstr *hotVMI = nullptr;
+  MachineInstr *hotSMI = nullptr;
+  DenseSet<MachineInstr *> hotSet;
+
+  GCNDownwardRPTracker Tracker(*LIS);
+
+  Tracker.reset(*MBB.begin(), &inputLive);
+  for (auto it = MBB.begin(); it != MBB.end(); it++) {
+    MachineInstr &MI = *it;
+    if (MI.isDebugInstr()) {
+      continue;
+    }
+
+    unsigned VPressure = Tracker.getPressure().getVGPRNum(ST.hasGFX90AInsts());
+    unsigned SPressure = Tracker.getPressure().getMaxSGPR();
+
+    SPressure += RegForVCC;
+
+    VPressure -= vSaved;
+    SPressure -= sSaved;
+    Tracker.advance();
+
+    if (VPressure <= VLimit && SPressure <= SLimit) {
+      continue;
+    }
+    hotSet.insert(&MI);
+    if (maxLocalVPressure < VPressure) {
+      maxLocalVPressure = VPressure;
+      hotVMI = &MI;
+    }
+    if (maxLocalSPressure < SPressure) {
+      maxLocalSPressure = SPressure;
+      hotSMI = &MI;
+    }
+  }
+
+  inBlockHotVInstMap[&MBB] = hotVMI;
+  inBlockHotSInstMap[&MBB] = hotSMI;
+  if (vDistance > 0 && hotVMI) {
+    // Use hotVMI when applying.
+    inBlockHotSInstMap[&MBB] = nullptr;
+    if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive,
+                    outputLive, hotSet, vDistance, sDistance, VLimit, SLimit,
+                    status.MemWriteMBBSet,
+                    LIS, MRI, SIRI, SIII))
+      return true;
+  }
+
+  if (sDistance > 0 && hotSMI) {
+    // Use hotSMI when applying.
+    inBlockHotSInstMap[&MBB] = hotSMI;
+    inBlockHotVInstMap[&MBB] = nullptr;
+    return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false,
+                    inputLive, outputLive, hotSet, vDistance, sDistance, VLimit,
+                    SLimit, status.MemWriteMBBSet,
+                    LIS, MRI, SIRI, SIII);
+  }
+  return false;
+}
+// Sort subExpCandidates to make sure deeper subExps are applied first.
+// If subExp0 uses the result of subExp1, subExp0 is deeper than subExp1.
+// If subExp1 were applied before subExp0, the new clone of subExp0 that uses
+// the result of subExp1 would still reference the old reg of subExp1, and reg
+// pressure would not be reduced.
+void sortSubExpCandidates(std::vector<SubExp> &subExpCandidates) {
+  MapVector<unsigned, SetVector<SubExp *>> inputMap;
+  MapVector<unsigned, SetVector<SubExp *>> outputMap;
+  struct SortNode {
+    SubExp Exp;
+    unsigned Depth;
+    bool bDepthDirty;
+    SmallDenseSet<SubExp *, 2> Preds;
+    SmallDenseSet<SubExp *, 2> Succs;
+  };
+
+  {
+    SmallVector<unsigned, 10> RegSortStorage;
+    for (SubExp &Exp : subExpCandidates) {
+      RegSortStorage.assign(Exp.TopRegs.begin(), Exp.TopRegs.end());
+      std::sort(RegSortStorage.begin(), RegSortStorage.end());
+      for (auto it : RegSortStorage) {
+        unsigned Reg = it;
+        inputMap[Reg].insert(&Exp);
+      }
+
+      RegSortStorage.assign(Exp.BottomRegs.begin(), Exp.BottomRegs.end());
+      std::sort(RegSortStorage.begin(), RegSortStorage.end());
+      for (auto it : RegSortStorage) {
+        unsigned Reg = it;
+        outputMap[Reg].insert(&Exp);
+      }
+    }
+  }
+
+  MapVector<SubExp *, SortNode> sortMap;
+  for (auto it : inputMap) {
+    unsigned Reg = it.first;
+    auto outIt = outputMap.find(Reg);
+    if (outIt == outputMap.end())
+      continue;
+    auto &inExps = it.second;
+    auto &outExps = outIt->second;
+    for (SubExp *inExp : inExps) {
+      for (SubExp *outExp : outExps) {
+        if (inExp->bHoist != outExp->bHoist) {
+          // Different direction.
+          // If the output (def) moves up and the input (use) moves down,
+          // nothing happens.
+          if (outExp->bHoist)
+            continue;
+          // Cannot have the input (use) move up while the output (def)
+          // moves down. Choose the exp which saves more.
+          int inExpGain = inExp->vOutputSize - inExp->vInputSize;
+          int outExpGain = outExp->vInputSize - inExp->vOutputSize;
+          if (inExpGain >= outExpGain) {
+            outExp->SUnits.clear();
+          } else {
+            inExp->SUnits.clear();
+          }
+          continue;
+        }
+        // Link outExp to inExp.
+        if (inExp->bHoist) {
+          sortMap[outExp].Preds.insert(inExp);
+          sortMap[inExp].Succs.insert(outExp);
+        } else {
+          sortMap[inExp].Preds.insert(outExp);
+          sortMap[outExp].Succs.insert(inExp);
+        }
+      }
+    }
+  }
+
+  if (sortMap.empty())
+    return;
+
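+  // Compute each node's depth (longest predecessor chain) with a worklist:
+  // nodes with no predecessors seed the list at depth 0, and a successor is
+  // pushed once all of its predecessors have been resolved.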
+  SmallVector<SubExp *, 8> WorkList;
+  for (SubExp &Exp : subExpCandidates) {
+    SortNode &Node = sortMap[&Exp];
+    Node.Depth = 0;
+    Node.Exp = Exp;
+    Node.bDepthDirty = !Node.Preds.empty();
+    if (!Node.bDepthDirty)
+      WorkList.emplace_back(&Exp);
+  }
+  // Calc depth.
+  while (!WorkList.empty()) {
+    SubExp *Exp = WorkList.pop_back_val();
+    SortNode &Node = sortMap[Exp];
+    for (SubExp *Succ : Node.Succs) {
+      SortNode &SuccNode = sortMap[Succ];
+      SuccNode.Depth = std::max(SuccNode.Depth, Node.Depth + 1);
+      bool bAllPrevClean = true;
+      for (SubExp *Prev : SuccNode.Preds) {
+        SortNode &PrevNode = sortMap[Prev];
+        if (PrevNode.bDepthDirty) {
+          bAllPrevClean = false;
+          break;
+        }
+      }
+      if (bAllPrevClean) {
+        SuccNode.bDepthDirty = false;
+        WorkList.push_back(Succ);
+      }
+    }
+  }
+
+  std::vector<SortNode *> nodes;
+  for (auto &it : sortMap) {
+    SortNode &node = it.second;
+    nodes.emplace_back(&node);
+  }
+
+  struct sorter {
+    bool operator()(const SortNode *a, const SortNode *b) {
+      return a->Depth > b->Depth;
+    }
+  };
+
+  // Deeper subExps should be applied first.
+  std::sort(nodes.begin(), nodes.end(), sorter());
+
+  subExpCandidates.clear();
+  for (auto &node : nodes) {
+    subExpCandidates.emplace_back(node->Exp);
+  }
+}
+
+// Compare pressure, return true if maxV0/maxS0 pressure is higher than
+// maxV1/maxS1.
+bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1,
+                    unsigned maxS1, const GCNSubtarget *ST) {
+  unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0);
+  unsigned VTgtOcc1 = ST->getOccupancyWithNumVGPRs(maxV1);
+  unsigned STgtOcc0 = ST->getOccupancyWithNumSGPRs(maxS0);
+  unsigned STgtOcc1 = ST->getOccupancyWithNumSGPRs(maxS1);
+  unsigned Occ0 = std::min(VTgtOcc0, STgtOcc0);
+  unsigned Occ1 = std::min(VTgtOcc1, STgtOcc1);
+  // Higher occupancy means lower pressure.
+  if (Occ0 > Occ1)
+    return false;
+  if (Occ0 < Occ1)
+    return true;
+  // When sgpr bound, more sgprs means higher pressure.
+  if (VTgtOcc0 > STgtOcc0 && VTgtOcc1 > STgtOcc1) {
+    return maxS0 > maxS1;
+  }
+  // When vgpr bound or mixed, more vgprs means higher pressure.
+  return maxV0 > maxV1;
+}
+
+// Return true if the subExp can help pressure for passThrus.
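+// Sinking helps when it kills more output liveness at the hot block than the
+// input liveness it adds; the nearest common dominator of the users is
+// recorded as subExp.ToBB.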
+bool canHelpPressureWhenSink(SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus,
+                     const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                     const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
+                     MachineDominatorTree *pDT, bool bCanClone,
+                     bool bSgprBound) {
+  LLVM_DEBUG(subExp.dump(MRI, SIRI));
+  if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false))
+    return false;
+
+  // Update input size to ignore live regs which are already in passThrus.
+  for (auto it : subExp.inputLive) {
+    unsigned Reg = it.first;
+    if (passThrus.count(Reg) == 0)
+      continue;
+    unsigned Size = getRegSize(Reg, it.second, MRI, SIRI);
+    if (SIRI->isVGPR(MRI, Reg)) {
+      subExp.vInputSize -= Size;
+    } else {
+      subExp.sInputSize -= Size;
+    }
+  }
+
+  if (subExp.vInputSize > subExp.vOutputSize)
+    return false;
+
+  if (subExp.sInputSize > subExp.sOutputSize && bSgprBound)
+    return false;
+
+  if (subExp.sInputSize >= subExp.sOutputSize &&
+      subExp.vInputSize == subExp.vOutputSize)
+    return false;
+
+  // Try to find an insert block.
+  // Skip sub exps with multi def outputs.
+  // Collect the user blocks and find their common dominator.
+  BlockSet userBlocks;
+  for (unsigned Reg : subExp.BottomRegs) {
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      MachineBasicBlock *UserBB = UseMI.getParent();
+      // Skip current BB.
+      if (UserBB != subExp.FromBB)
+        userBlocks.insert(UserBB);
+    }
+  }
+  if (userBlocks.empty())
+    return false;
+  MachineBasicBlock *userBlock = nearest_common_dominator(pDT, userBlocks);
+  if (!pDT->dominates(subExp.FromBB, userBlock)) {
+    return false;
+  }
+  if (userBlock == subExp.FromBB &&
+      // When allow clone, could go clone path if cannot move subExp.
+      !bCanClone)
+    return false;
+
+  subExp.ToBB = userBlock;
+  if (auto *toLoop = MLI->getLoopFor(userBlock)) {
+    auto *fromLoop = MLI->getLoopFor(subExp.FromBB);
+    if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth())
+      subExp.bMoveIntoLoop = true;
+  } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) {
+    auto *toLoop = MLI->getLoopFor(userBlock);
+    // Not safe to move out of the loop.
+    if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() ||
+        toLoop != fromLoop)
+      return false;
+  }
+  return true;
+}
+
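+// Return true if hoisting the subExp near the single block defining its
+// inputs can help pressure: the inputs it kills must be at least as large as
+// the outputs it makes live, and the move must not cross into a deeper loop.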
+bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI,
+                              const SIRegisterInfo *SIRI,
+                              const SIInstrInfo *SIII,
+                              const MachineLoopInfo *MLI, bool bSgprBound) {
+  if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ true))
+    return false;
+  if (subExp.vInputSize < subExp.vOutputSize)
+    return false;
+  if (subExp.sInputSize < subExp.sOutputSize && bSgprBound)
+    return false;
+
+  if (subExp.sInputSize <= subExp.sOutputSize &&
+      subExp.vInputSize == subExp.vOutputSize)
+    return false;
+
+  // Try to find an insert block.
+  // Collect the def blocks of the inputs; require a single def block.
+  BlockSet defBlocks;
+  for (unsigned Reg : subExp.TopRegs) {
+    MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+    if (!DefMI)
+      continue;
+    defBlocks.insert(DefMI->getParent());
+  }
+  if (defBlocks.size() != 1)
+    return false;
+  MachineBasicBlock *defBlock = *defBlocks.begin();
+  subExp.ToBB = defBlock;
+  // Do not hoist within the same block.
+  if (subExp.ToBB == subExp.FromBB)
+    return false;
+
+  if (auto *toLoop = MLI->getLoopFor(defBlock)) {
+    auto *fromLoop = MLI->getLoopFor(subExp.FromBB);
+    // TODO: enable moving into a loop when hoisting.
+    if (!fromLoop || fromLoop->getLoopDepth() < toLoop->getLoopDepth())
+      return false;
+  } else if (auto *fromLoop = MLI->getLoopFor(subExp.FromBB)) {
+    auto *toLoop = MLI->getLoopFor(defBlock);
+    // Not safe to move out of the loop.
+    if (!toLoop || fromLoop->getLoopDepth() > toLoop->getLoopDepth() ||
+        toLoop != fromLoop)
+      return false;
+  }
+  return true;
+}
+
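+// Group the safe pass thru regs by their defining block so sub expressions
+// can be built per def block; the result is sorted by block number to keep
+// processing deterministic.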
+SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+groupPassThruByDefBlock(Remat *Remat,
+                        const GCNRPTracker::LiveRegSet &passThrus,
+                        GCNRPTracker::LiveRegSet &usedPassThrus,
+                        MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                        const SIInstrInfo *SIII) {
+  MapVector<MachineBasicBlock *, GCNRPTracker::LiveRegSet> Candidates;
+
+  // Group safe candidates by define block.
+  for (auto it : passThrus) {
+    unsigned Reg = it.first;
+    // Skip used pass thru regs to avoid counting them twice for different
+    // hot blocks.
+    if (usedPassThrus.count(Reg))
+      continue;
+    LLVM_DEBUG(print_vreg(Reg, MRI));
+    LLVM_DEBUG(if (SIRI->isSGPRReg(MRI, Reg)) dbgs() << " sgpr ";
+               else dbgs() << " vgpr ";);
+    if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true)) {
+      LLVM_DEBUG(dbgs() << " is not safe\n");
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << " is safe\n");
+    // DefMI is already checked in isSafeCandidate.
+    MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+
+    GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()];
+    DefInMBB[Reg] = it.second;
+  }
+
+  llvm::SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+      result = Candidates.takeVector();
+
+  LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it
+                                                              : result) {
+    MachineBasicBlock *MBB = it.first;
+    auto &defInMBB = it.second;
+    MBB->dump();
+    llvm::dumpLiveSet(defInMBB, SIRI);
+  } llvm::dbgs() << "end of candidates\n";);
+
+  std::sort(result.begin(), result.end(),
+            [](std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet> &it0,
+               std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet> &it1) {
+              return it0.first->getNumber() < it1.first->getNumber();
+            });
+
+  LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it
+                                                              : result) {
+    MachineBasicBlock *MBB = it.first;
+    auto &defInMBB = it.second;
+    MBB->dump();
+    llvm::dumpLiveSet(defInMBB, SIRI);
+  } llvm::dbgs() << "end of candidates\n";);
+
+  return result;
+}
+
+// Collect the pass thru regs of MBB.
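+// A pass thru reg is live into and out of MBB, appears in liveRegCandidates,
+// and has no def (and, unless cloning is allowed, no use) inside MBB.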
+GCNRPTracker::LiveRegSet
+collectPassThrus(MachineBasicBlock *MBB,
+                 const GCNRPTracker::LiveRegSet &inputLive,
+                 const GCNRPTracker::LiveRegSet &outputLive,
+                 const GCNRPTracker::LiveRegSet &usedPassThrus,
+                 const GCNRPTracker::LiveRegSet &liveRegCandidates,
+                 MachineRegisterInfo &MRI, bool bCanClone) {
+  GCNRPTracker::LiveRegSet passThrus;
+  llvm::mergeLiveRegSet(passThrus, inputLive);
+  llvm::andLiveRegSet(passThrus, outputLive);
+
+  // Remove regs which are not in liveRegCandidates.
+  GCNRPTracker::LiveRegSet tmpPassThrus = passThrus;
+  for (auto it : tmpPassThrus) {
+    unsigned Reg = it.first;
+    if (!liveRegCandidates.count(Reg)) {
+      passThrus.erase(Reg);
+    }
+  }
+  tmpPassThrus = passThrus;
+  // Remove regs which are read or written in MBB.
+  for (auto it : tmpPassThrus) {
+    unsigned Reg = it.first;
+    DenseSet<MachineBasicBlock *> DefMBBs;
+    for (MachineInstr &DefMI : MRI.def_instructions(Reg)) {
+      MachineBasicBlock *MBB = DefMI.getParent();
+      DefMBBs.insert(MBB);
+    }
+    DenseSet<MachineBasicBlock *> UseMBBs;
+    // Allow uses of a pass thru reg in MBB if cloning is OK.
+    if (!bCanClone) {
+      for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+        MachineBasicBlock *UserMBB = UseMI.getParent();
+        UseMBBs.insert(UserMBB);
+      }
+    }
+    bool bW = DefMBBs.count(MBB) > 0;
+    bool bR = UseMBBs.count(MBB) > 0;
+
+    bool bPassThru = !bW && !bR;
+    if (!bPassThru)
+      passThrus.erase(Reg);
+  }
+  return passThrus;
+}
+// Try to build a free subExp whose inputs are all passThrus.
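+// "Free" here means every use operand is either a pass thru input of the
+// subExp or defined by another free inst, so cloning the free part does not
+// pull extra live regs into the hot block.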
+SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, GCNRPTracker::LiveRegSet &passThrus,
+                       MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) {
+  SubExp freeExp;
+  // Try to split the subExp to find a case that helps.
+  // Scan all insts in the subExp and propagate free insts whose inputs come
+  // from passThrus.
+  SmallDenseSet<unsigned, 4> freeRegs;
+  SmallDenseSet<unsigned, 8> freeInstUseRegs;
+  SmallVector<MachineInstr *, 4> freeInsts;
+  for (MachineInstr *MI : subExp.SUnits) {
+    bool bIsFree = true;
+    // Check all use regs are free.
+    for (MachineOperand &MO : MI->uses()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (MO.isImplicit() && Reg == AMDGPU::EXEC)
+        continue;
+      if (MRI.getUniqueVRegDef(Reg) == nullptr) {
+        bIsFree = false;
+        break;
+      }
+      // A pass thru reg which is also an input of the subExp is a free
+      // source.
+      if (passThrus.count(Reg) && subExp.TopRegs.count(Reg))
+        continue;
+      if (freeRegs.count(Reg))
+        continue;
+      bIsFree = false;
+      break;
+    }
+    // Check def is unique.
+    for (MachineOperand &MO : MI->defs()) {
+      unsigned Reg = MO.getReg();
+      if (MRI.getUniqueVRegDef(Reg) == nullptr) {
+        bIsFree = false;
+        break;
+      }
+    }
+    if (!bIsFree)
+      continue;
+    // Save inst as free inst.
+    freeInsts.emplace_back(MI);
+    // Save def as free reg.
+    for (MachineOperand &MO : MI->defs()) {
+      unsigned Reg = MO.getReg();
+      freeRegs.insert(Reg);
+    }
+    // Save use regs as free use reg.
+    for (MachineOperand &MO : MI->uses()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+
+      freeInstUseRegs.insert(Reg);
+    }
+  }
+  // Then drop free insts whose defs are not used by other free insts or as
+  // outputs.
+  for (MachineInstr *MI : freeInsts) {
+    bool bIsFreeUsed = false;
+    for (MachineOperand &MO : MI->defs()) {
+      unsigned Reg = MO.getReg();
+      // Used as freeInst or output.
+      bIsFreeUsed |=
+          freeInstUseRegs.count(Reg) > 0 || subExp.BottomRegs.count(Reg);
+    }
+    if (!bIsFreeUsed)
+      continue;
+    freeExp.SUnits.emplace_back(MI);
+  }
+  if (freeExp.SUnits.empty()) {
+    // Mark as having a terminator to make the exp unsafe.
+    freeExp.bHasTerminatorInst = true;
+    return freeExp;
+  }
+  // Build BottomRegs and TopRegs for freeExp.
+  // BottomRegs is freeRegs in subExp.BottomRegs.
+  for (unsigned freeReg : freeRegs) {
+    if (subExp.BottomRegs.count(freeReg))
+      freeExp.BottomRegs.insert(freeReg);
+  }
+  // TopRegs is freeInstUseRegs in subExp.TopRegs.
+  for (unsigned freeInstUseReg : freeInstUseRegs) {
+    if (subExp.TopRegs.count(freeInstUseReg))
+      freeExp.TopRegs.insert(freeInstUseReg);
+  }
+  freeExp.FromBB = subExp.FromBB;
+  freeExp.ToBB = subExp.ToBB;
+  // Must be cloned since it is only a part of subExp.
+  freeExp.bCloneOnly = true;
+
+  // Calc reg pressure for freeExp.
+  for (unsigned Reg : freeExp.TopRegs) {
+    freeExp.inputLive[Reg];
+  }
+
+  for (unsigned Reg : freeExp.BottomRegs) {
+    freeExp.outputLive[Reg];
+  }
+
+  CollectLiveSetPressure(freeExp.inputLive, MRI, SIRI, freeExp.vInputSize,
+                         freeExp.sInputSize);
+  CollectLiveSetPressure(freeExp.outputLive, MRI, SIRI, freeExp.vOutputSize,
+                         freeExp.sOutputSize);
+  return freeExp;
+}
+
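+// For each def block, build sub expression dags for the pass thru regs
+// defined there and keep only the subExps that help pressure when sunk (or,
+// when partial use is allowed, their free parts).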
+std::vector<SubExp> buildSubExpCandidates(
+    Remat *Remat,
+    SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+        &Candidates,
+    GCNRPTracker::LiveRegSet &passThrus, MachineRegisterInfo &MRI,
+    const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+    const MachineLoopInfo *MLI, SlotIndexes *slotIndexes,
+    MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound,
+    GCNRPTracker::LiveRegSet &unUsedPassThrus,
+    DenseSet<MachineBasicBlock *> &MemWriteMBBSet,
+    bool bAllowPartialUseInSubExp) {
+  std::vector<SubExp> subExpCandidates;
+  // Build exp dag on define blocks.
+  // Save profit candidates into list.
+  for (auto &it : Candidates) {
+    MachineBasicBlock *DefMBB = it.first;
+    // Try to move the sub exps that define the out regs out of DefMBB.
+    GCNRPTracker::LiveRegSet &DefInMBB = it.second;
+    // Go up the dag until reaching a shared node.
+    auto subExps =
+        buildSubExpFromCandidates(Remat, DefInMBB, DefMBB, SIRI, SIII, MRI,
+                                  slotIndexes, unUsedPassThrus, bAllowPartialUseInSubExp);
+    for (SubExp &subExp : subExps) {
+      if (subExp.bHasMemInst) {
+        // Skip when memory ld/st inst need to cross MBB which write memory.
+        // TODO: check all MBBs in between FromBB and ToBB not write memory.
+        // Currently just skip when any memory write exist.
+        if (!MemWriteMBBSet.empty()) {
+          MachineBasicBlock *FromBB = subExp.FromBB;
+          MachineBasicBlock *ToBB = subExp.ToBB;
+          if (subExp.bHoist) {
+            FromBB = subExp.ToBB;
+            ToBB = subExp.FromBB;
+          }
+          bool bCrossMemWriteMBB = false;
+          for (MachineBasicBlock *MemMBB : MemWriteMBBSet) {
+            if (pDT->dominates(ToBB, MemMBB))
+              continue;
+            if (pDT->dominates(MemMBB, FromBB))
+              continue;
+            bCrossMemWriteMBB = true;
+            break;
+          }
+          if (bCrossMemWriteMBB)
+            continue;
+        }
+      }
+      if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT,
+                           bCanClone, bSgprBound)) {
+        if (bAllowPartialUseInSubExp && subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) {
+          SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI);
+          if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, MLI, pDT,
+                              bCanClone, bSgprBound)) {
+            subExpCandidates.emplace_back(freeSubExp);
+          }
+        }
+        continue;
+      }
+
+      subExpCandidates.emplace_back(subExp);
+    }
+  }
+  return subExpCandidates;
+}
+
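+// Estimate how the already-collected subExpCandidates change the VGPR/SGPR
+// pressure of hotBB. Sunk outputs that were live across the block stop being
+// live, while inputs that were not live become live (and vice versa for
+// hoists). Returns the (vgpr, sgpr) delta, where negative values are savings.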
+std::pair<int, int>
+calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
+                GCNRPTracker::LiveRegSet &inputLive,
+                GCNRPTracker::LiveRegSet &outputLive, bool bVOutBound,
+                bool bSOutBound, bool bCanClone, MachineDominatorTree *pDT,
+                const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) {
+  int vgpr = 0;
+  int sgpr = 0;
+  MachineBasicBlock *MBB = hotBB.MBB;
+  // Sink saving.
+  for (SubExp &Exp : subExpCandidates) {
+    if (Exp.bHoist) {
+      // ToBB -> MBB -> FromBB.
+      // If ToBB does not dominate the hot block, the reg will not be live in
+      // MBB.
+      if (!pDT->dominates(Exp.ToBB, MBB))
+        continue;
+    } else {
+      // If FromBB does not dominate the hot block, the reg will not be live
+      // in MBB.
+      if (!pDT->dominates(Exp.FromBB, MBB))
+        continue;
+      // When subExp is from hotBB, check output instead of input.
+      if (Exp.FromBB == MBB) {
+        if (bVOutBound && Exp.vOutputSize < Exp.vInputSize)
+          continue;
+        if (bSOutBound && Exp.sOutputSize < Exp.sInputSize)
+          continue;
+        vgpr += Exp.vInputSize;
+        vgpr -= Exp.vOutputSize;
+        sgpr += Exp.sInputSize;
+        sgpr -= Exp.sOutputSize;
+        continue;
+      }
+    }
+    int vgprDiff = 0;
+    int sgprDiff = 0;
+    MachineBasicBlock *ToMBB = Exp.ToBB;
+    // If the subExp sinks into hotBB, it crosses the output instead of the
+    // input.
+    GCNRPTracker::LiveRegSet &crossLive = MBB == ToMBB ? outputLive : inputLive;
+
+    bool bClone = false;
+    GCNRPTracker::LiveRegSet newInput;
+    if (!Exp.bMoveIntoLoop) {
+      if (Exp.bHoist) {
+        // If FromBB dominates the hot block, the move will not change
+        // liveness for MBB.
+        if (Exp.FromBB != MBB && pDT->dominates(Exp.FromBB, MBB))
+          continue;
+      } else {
+        // If ToBB dominates the hot block, the move will not change liveness
+        // for MBB.
+        if (ToMBB != MBB && pDT->dominates(ToMBB, MBB)) {
+          if (bCanClone && !Exp.bNotSafeToCopy) {
+            bClone = true;
+          } else {
+            continue;
+          }
+        }
+      }
+
+      for (auto outIt : Exp.outputLive) {
+        unsigned Reg = outIt.first;
+        LaneBitmask outMask = outIt.second;
+        LaneBitmask MBBBeginMask;
+        if (crossLive.find(Reg) != crossLive.end())
+          MBBBeginMask = crossLive[Reg];
+        // When sinking, check the mask which is live in both BeginSlot and
+        // the exp output, since sinking kills the output. When hoisting,
+        // check the mask which is not live in BeginSlot but live in the exp
+        // output, since hoisting makes the output live.
+        LaneBitmask profitMask =
+            Exp.bHoist ? (outMask & (~MBBBeginMask)) : (outMask & MBBBeginMask);
+        if (MBBBeginMask.any()) {
+          unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
+          LLVM_DEBUG(std::string movStr =
+                         Exp.bHoist ? "output hoist:" : "output sink:";
+                     dbgs() << movStr << Register::virtReg2Index(Reg)
+                            << " " << Size);
+          // The exp output is live at the block input.
+          // It will decrease liveness for MBB when sinking and increase it
+          // when hoisting.
+          if (SIRI->isVGPR(MRI, Reg)) {
+            LLVM_DEBUG(dbgs() << "v\n");
+            if (Exp.bHoist)
+              vgprDiff += Size;
+            else
+              vgprDiff -= Size;
+          } else {
+            LLVM_DEBUG(dbgs() << "s\n");
+            if (Exp.bHoist)
+              sgprDiff += Size;
+            else
+              sgprDiff -= Size;
+          }
+        }
+      }
+
+      for (auto inIt : Exp.inputLive) {
+        unsigned Reg = inIt.first;
+        LaneBitmask inMask = inIt.second;
+        LaneBitmask MBBBeginMask;
+        if (crossLive.find(Reg) != crossLive.end())
+          MBBBeginMask = crossLive[Reg];
+        // When sinking, check the mask which is not live in BeginSlot but
+        // live in the exp input, since sinking makes the input live. When
+        // hoisting, check the mask which is live in both BeginSlot and the
+        // exp input, since hoisting kills the input.
+        LaneBitmask profitMask =
+            Exp.bHoist ? (inMask & MBBBeginMask) : (inMask & (~MBBBeginMask));
+        if (profitMask.any()) {
+          // Update input live to avoid counting the same input more than
+          // once.
+          newInput[Reg] |= inMask;
+          // The exp input is not live at the block input.
+          // It will increase liveness for MBB.
+          unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
+
+          LLVM_DEBUG(std::string movStr =
+                         Exp.bHoist ? "input hoist:" : "input sink:";
+                     dbgs() << movStr << Register::virtReg2Index(Reg)
+                            << " " << Size);
+          if (SIRI->isVGPR(MRI, Reg)) {
+            LLVM_DEBUG(dbgs() << "v\n");
+            if (Exp.bHoist)
+              vgprDiff -= Size;
+            else
+              vgprDiff += Size;
+          } else {
+            LLVM_DEBUG(dbgs() << "s\n");
+            if (Exp.bHoist)
+              sgprDiff -= Size;
+            else
+              sgprDiff += Size;
+          }
+        }
+      }
+    } else {
+      // When sinking into a loop, the input will be live in every block
+      // inside the loop. The output will only be live between the to blocks
+      // and the use blocks. If MBB dominates any user of an output live reg,
+      // that reg will still be live in MBB, so it cannot be counted as
+      // profit. Hoisting into a loop is not supported now.
+      for (auto outIt : Exp.outputLive) {
+        unsigned Reg = outIt.first;
+        bool bDomUser = false;
+        for (MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) {
+          MachineBasicBlock *UserMBB = MI.getParent();
+          if (pDT->dominates(MBB, UserMBB)) {
+            bDomUser = true;
+            break;
+          }
+        }
+        if (bDomUser)
+          continue;
+
+        LaneBitmask outMask = outIt.second;
+        LaneBitmask MBBBeginMask;
+        if (inputLive.find(Reg) != inputLive.end())
+          MBBBeginMask = inputLive[Reg];
+        LaneBitmask profitMask = outMask & MBBBeginMask;
+        if (MBBBeginMask.any()) {
+          unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
+          LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg)
+                            << " " << Size);
+          // The exp output is live at the block input.
+          // It will decrease liveness for MBB.
+          if (SIRI->isVGPR(MRI, Reg)) {
+            LLVM_DEBUG(dbgs() << "v\n");
+            vgprDiff -= Size;
+          } else {
+            LLVM_DEBUG(dbgs() << "s\n");
+            sgprDiff -= Size;
+          }
+        }
+      }
+
+      for (auto inIt : Exp.inputLive) {
+        unsigned Reg = inIt.first;
+        LaneBitmask inMask = inIt.second;
+        LaneBitmask MBBBeginMask;
+        if (inputLive.find(Reg) != inputLive.end())
+          MBBBeginMask = inputLive[Reg];
+        // Check the mask which is not live in BeginSlot but live in the exp
+        // input.
+        LaneBitmask profitMask = inMask & (~MBBBeginMask);
+        if (profitMask.any()) {
+          // Update input live to avoid counting the same input more than
+          // once.
+          newInput[Reg] |= inMask;
+          // The exp input is not live at the block input.
+          // It will increase liveness for MBB.
+          unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
+
+          LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg)
+                            << " " << Size);
+          if (SIRI->isVGPR(MRI, Reg)) {
+            LLVM_DEBUG(dbgs() << "v\n");
+            vgprDiff += Size;
+          } else {
+            LLVM_DEBUG(dbgs() << "s\n");
+            sgprDiff += Size;
+          }
+        }
+      }
+    }
+
+    if (bVOutBound && vgprDiff > 0)
+      continue;
+
+    if (bSOutBound && sgprDiff > 0)
+      continue;
+    llvm::mergeLiveRegSet(crossLive, newInput);
+    vgpr += vgprDiff;
+    sgpr += sgprDiff;
+    if (bClone)
+      Exp.bCloneOnly = true;
+  }
+
+  return std::make_pair(vgpr, sgpr);
+}
+
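+// Append the accepted subExps to subExpCandidates and record the regs they
+// move (TopRegs for hoists, BottomRegs for sinks) in usedRegs so later
+// candidates do not reuse them.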
+void addExpCandidates(std::vector<SubExp> &subExpCandidates,
+                      std::vector<SubExp> &subExps,
+                      GCNRPTracker::LiveRegSet &usedRegs) {
+  subExpCandidates.insert(subExpCandidates.end(), subExps.begin(),
+                          subExps.end());
+  for (auto &Exp : subExps) {
+    if (Exp.bHoist) {
+      for (auto &Reg : Exp.TopRegs) {
+        usedRegs[Reg];
+      }
+    } else {
+      for (auto &Reg : Exp.BottomRegs) {
+        usedRegs[Reg];
+      }
+    }
+  }
+}
+
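+// Try to collect enough subExps for hotBB to bring vgpr/sgpr within the
+// target limits: first sink candidates, then (if aggressive) hoist
+// candidates, then vmem-latency and in-block remat fallbacks. Returns true
+// when the collected candidates are accepted into subExpCandidates.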
+bool tryToAddSubExps(
+    Remat *Remat,
+    HotBlock &hotBB, RematStatus &status, std::vector<SubExp> &subExpCandidates,
+    std::vector<SubExp> &inBlockCloneSubExps,
+    DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
+    DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotSInstMap,
+    SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+        Candidates,
+    int vgpr, int sgpr, const GCNRPTracker::LiveRegSet &savingInputLive,
+    const GCNRPTracker::LiveRegSet &savingOutputLive,
+    GCNRPTracker::LiveRegSet &passThrus, GCNRPTracker::LiveRegSet &usedRegs,
+    MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+    const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
+    SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT,
+    bool bCanClone, bool bVOutBound, bool bSOutBound,
+    GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) {
+  std::vector<SubExp> partialSubExps = buildSubExpCandidates(Remat,
+      Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, bCanClone,
+      bSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
+      bAllowPartialUseInSubExp);
+
+  GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive;
+  GCNRPTracker::LiveRegSet tmpSavingOutputLive = savingOutputLive;
+  std::pair<int, int> curSaving = calculateSaving(
+      hotBB, partialSubExps, tmpSavingInputLive, tmpSavingOutputLive,
+      bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+  const int VLimit = status.TargetVLimit;
+  const int SLimit = status.TargetSLimit;
+
+  vgpr += curSaving.first;
+  sgpr += curSaving.second;
+
+  if (vgpr <= VLimit && sgpr <= SLimit) {
+    // These subExps can help reach the target occupancy; add them to
+    // subExpCandidates.
+    addExpCandidates(subExpCandidates, partialSubExps, usedRegs);
+    return true;
+  }
+
+  if (EnableSubExpAggressive) {
+    // Build candidates from passThrus but not used in partialSubExps.
+    GCNRPTracker::LiveRegSet sinkUsedRegs;
+    for (auto &Exp : partialSubExps) {
+      for (auto &Reg : Exp.BottomRegs) {
+        sinkUsedRegs[Reg];
+      }
+    }
+    MapVector<MachineBasicBlock *, GCNRPTracker::LiveRegSet> HoistCandidates;
+    for (auto &it : hotBB.inputLive) {
+      unsigned Reg = it.first;
+      // Skip regs already used for a sink exp.
+      if (sinkUsedRegs.count(Reg))
+        continue;
+      if (usedRegs.count(Reg))
+        continue;
+      // Skip unsafe reg.
+      if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ false)) {
+        LLVM_DEBUG(dbgs() << " is not safe to hoist\n");
+        continue;
+      }
+      // DefMI is already checked in isSafeCandidate.
+      MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+      MachineBasicBlock *DefMBB = DefMI->getParent();
+      DenseSet<MachineBasicBlock *> UseMBBSet;
+      // Make sure all uses not in Def block are in same block.
+      for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+        MachineBasicBlock *UseMBB = UseMI.getParent();
+        if (UseMBB == DefMBB)
+          continue;
+        UseMBBSet.insert(UseMBB);
+      }
+
+      if (UseMBBSet.size() != 1)
+        continue;
+      MachineBasicBlock *UseMBB = *UseMBBSet.begin();
+      GCNRPTracker::LiveRegSet &UseInMBB = HoistCandidates[UseMBB];
+      UseInMBB[Reg] = getRegMask(DefMI->getOperand(0), MRI);
+    }
+
+    SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+    // Build exp dag on define blocks.
+    std::vector<SubExp> hoistSubExpCandidates;
+    // Save profit candidates into list.
+    for (auto it : HoistCandidates) {
+      MachineBasicBlock *UseMBB = it.first;
+      // Try to build sub exps in UseMBB that can be hoisted toward their
+      // defs.
+      GCNRPTracker::LiveRegSet &UseInMBB = it.second;
+      // Walk the dag until reaching a shared node.
+      auto subExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, SIRI,
+                                                        SIII, MRI, slotIndexes);
+      for (SubExp &subExp : subExps) {
+        if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound))
+          continue;
+        subExp.bHoist = true;
+        hoistSubExpCandidates.emplace_back(subExp);
+      }
+    }
+
+    std::pair<int, int> hoistSaving = calculateSaving(
+        hotBB, hoistSubExpCandidates, tmpSavingInputLive, tmpSavingOutputLive,
+        bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+
+    int hoistVgpr = vgpr + hoistSaving.first;
+    int hoistSgpr = sgpr + hoistSaving.second;
+
+    if ((hoistVgpr <= VLimit && hoistSgpr <= SLimit) ||
+        // If the status is not balanced, do the remat even if the target
+        // cannot be reached.
+        // TODO: check whether the result fails to help even one occupancy.
+        (!hoistSubExpCandidates.empty() && !status.bNotBalance &&
+         TargetOccupancy != 0)) {
+      // These subExps can help reach the target occupancy; add them to
+      // subExpCandidates.
+      addExpCandidates(subExpCandidates, partialSubExps, usedRegs);
+      addExpCandidates(subExpCandidates, hoistSubExpCandidates, usedRegs);
+
+      return true;
+    }
+  }
+
+  if (EnableVmemDegree &&
+      // Only consider vmem on the last tryToAddSubExps.
+      // Otherwise, bAllowPartialUseInSubExp would have no chance to be true.
+      (bAllowPartialUseInSubExp ||
+       !EnableSubExpAggressive)) {
+    // Assume vmemLdSize could be optimized away by not issuing the loads in
+    // parallel.
+    if (((vgpr - hotBB.vmemLdInputSize) <= VLimit ||
+         (vgpr - hotBB.vmemLdOutputSize) <= VLimit) &&
+        sgpr <= SLimit) {
+      // These subExps can help reach the target occupancy; add them to
+      // subExpCandidates.
+      addExpCandidates(subExpCandidates, partialSubExps, usedRegs);
+      return true;
+    }
+  }
+
+  int vDistance = vgpr - (int)VLimit;
+  int sDistance = status.TargetOcc > 4 ? (sgpr - (int)SLimit) : 0;
+  int vSaved = hotBB.maxPressures.first - vgpr;
+  int sSaved = hotBB.maxPressures.second - sgpr;
+  // Try to add inBlockCloneSubExps.
+  if (!tryRematInHotSpot(*hotBB.MBB, status, vDistance, sDistance, vSaved,
+                         sSaved, inBlockCloneSubExps, inBlockHotVInstMap,
+                         inBlockHotSInstMap, LIS, MRI, SIRI, SIII)) {
+    // Always return false when partialUseInSubExp is not allowed; the caller
+    // will try again with partialUseInSubExp enabled.
+    if (!bAllowPartialUseInSubExp)
+      return false;
+    // If the status is not balanced, do the remat even if the target cannot
+    // be reached.
+    // TODO: check whether the result fails to help even one occupancy.
+    if (!status.bNotBalance && TargetOccupancy == 0)
+      return false;
+  }
+  // These subExps can help reach the target occupancy; add them to
+  // subExpCandidates.
+  addExpCandidates(subExpCandidates, partialSubExps, usedRegs);
+  return true;
+}
+
+// Remat passthru regs per hot block.
+// The reason to do it per block is to keep passthru reuse precise.
+// If remat were tried on all hot blocks together, the passthru might be in
+// one block while the reuse is in another block where the reg is not a
+// passthru.
+bool perBlockPassthruRemat(Remat *Remat,
+                           std::vector<HotBlock> &hotBlocks,
+                           RematStatus &status,
+                           GCNRPTracker::LiveRegSet &liveRegCandidates,
+                           const GCNSubtarget *ST, LiveIntervals *LIS,
+                           const MachineLoopInfo *MLI,
+                           MachineDominatorTree *pDT, MachineRegisterInfo &MRI,
+                           const SIRegisterInfo *SIRI,
+                           const SIInstrInfo *SIII) {
+  bool bUpdated = false;
+  bool bCanClone = EnableSubExpClone |
+                   EnableSubExpAggressive;
+
+  SlotIndexes *slotIndexes = LIS->getSlotIndexes();
+  // Sort hot blocks by pressure first.
+  // A hot block with higher pressure is more likely to fail.
+  // If it fails, fail fast. If it works, save the subExpCandidates; they may
+  // help other hot blocks.
+  std::sort(hotBlocks.begin(), hotBlocks.end(),
+            [&ST](const HotBlock &a, const HotBlock &b) {
+              return pressureHigher(a.maxPressures.first, a.maxPressures.second,
+                                    b.maxPressures.first, b.maxPressures.second,
+                                    ST);
+            });
+
+  std::vector<SubExp> subExpCandidates;
+  // For inBlock remat clone.
+  std::vector<SubExp> inBlockCloneSubExps;
+  DenseMap<MachineBasicBlock *, MachineInstr *> inBlockHotVInstMap;
+  DenseMap<MachineBasicBlock *, MachineInstr *> inBlockHotSInstMap;
+
+  // Save used passThrus to avoid use same reg on different MBB.
+  GCNRPTracker::LiveRegSet usedPassThrus;
+  // Save moved regs to avoid use same reg hoist and sink.
+  GCNRPTracker::LiveRegSet usedRegs;
+
+  const int VLimit = status.TargetVLimit;
+  const int SLimit = status.TargetSLimit;
+  // Collect passthru for hot block.
+  // Try remat on it.
+  for (auto &it : hotBlocks) {
+    MachineBasicBlock *MBB = it.MBB;
+
+    const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB];
+    const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB];
+
+    it.inputLive = inputLive;
+
+    // Increase pressure by 1 to account for spilling to vgpr.
+    const int PressureDelta = -1;
+    int vgpr = it.maxPressures.first - PressureDelta;
+    int sgpr = it.maxPressures.second;
+    bool bVOutBound = vgpr > VLimit;
+    bool bSOutBound = sgpr > SLimit;
+    // savingInputLive is used to calculate the saving; it is modified to
+    // avoid counting the same input multiple times.
+    GCNRPTracker::LiveRegSet savingInputLive = inputLive;
+    GCNRPTracker::LiveRegSet savingOutputLive = outputLive;
+    std::pair<int, int> curSaving =
+        calculateSaving(it, subExpCandidates, savingInputLive, savingOutputLive,
+                        bVOutBound, bSOutBound, bCanClone, pDT, MRI, SIRI);
+
+    vgpr += curSaving.first;
+    sgpr += curSaving.second;
+
+    if (vgpr <= VLimit && sgpr <= SLimit)
+      continue;
+
+    // Collect pass thru regs.
+    GCNRPTracker::LiveRegSet passThrus =
+        collectPassThrus(MBB, inputLive, outputLive, usedPassThrus,
+                         liveRegCandidates, MRI, bCanClone);
+
+    // Group pass thru regs by def MBB.
+    SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+        Candidates =
+        groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, MRI, SIRI, SIII);
+    // unusedPassThrus collects the passThrus which are skipped when building
+    // subExps.
+    GCNRPTracker::LiveRegSet unusedPassThrus;
+    // Build exp dag on define blocks.
+    bool bAllowPartialUseInSubExp = false;
+    if (tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+                        inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
+                        vgpr, sgpr, savingInputLive, savingOutputLive,
+                        passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
+                        LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+                        unusedPassThrus, bAllowPartialUseInSubExp)) {
+      // Remove unusedPassThrus from passThrus first.
+      llvm::andNotLiveRegSet(passThrus, unusedPassThrus);
+      llvm::mergeLiveRegSet(usedPassThrus, passThrus);
+      continue;
+    }
+    // If cloning is not possible, there is no need to try
+    // partialUseInSubExp, which requires cloning.
+    if (!bCanClone)
+      return false;
+
+    // Partial use subExps may result in a big ALU count due to cloning.
+    // Only try it when aggressive remat is enabled.
+    if (!EnableSubExpAggressive)
+      return false;
+
+    bAllowPartialUseInSubExp = true;
+    if (!tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+                         inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
+                         vgpr, sgpr, savingInputLive, savingOutputLive,
+                         passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
+                         LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+                         unusedPassThrus, bAllowPartialUseInSubExp)) {
+      return false;
+    }
+    // Just merge all passThrus after tryToAddSubExps with
+    // partialUseInSubExp allowed.
+    llvm::mergeLiveRegSet(usedPassThrus, passThrus);
+  }
+
+  // Apply changes.
+  {
+    // Sort subExpCandidates to make sure input uses are applied before
+    // output uses when a reg is both an input and an output of subExps.
+    LLVM_DEBUG(for (SubExp &Exp : subExpCandidates) { Exp.dump(MRI, SIRI); });
+    sortSubExpCandidates(subExpCandidates);
+
+    for (SubExp &Exp : subExpCandidates) {
+      // Skip exps which were cleared during sorting due to a hoist/sink
+      // conflict.
+      if (Exp.SUnits.empty())
+        continue;
+      LLVM_DEBUG(Exp.dump(MRI, SIRI));
+      if (Exp.bHoist) {
+        ApplySubExpMoveNearDefine(Exp, MRI, pDT, slotIndexes, SIII, SIRI);
+      } else {
+        if (Exp.bCloneOnly)
+          ApplySubExpCloneNearUser(Exp, hotBlocks, pDT, MRI, slotIndexes, SIII,
+                                   SIRI);
+        else
+          ApplySubExpMoveNearUser(Exp, MRI, pDT, slotIndexes, SIII, SIRI);
+      }
+    }
+
+    for (SubExp &Exp : inBlockCloneSubExps) {
+      ApplySubExpCloneNearUserInBlock(Exp, inBlockHotVInstMap,
+                                      inBlockHotSInstMap, MRI, slotIndexes,
+                                      SIII, SIRI);
+    }
+    // Try to see what occupancy could be reached, then decide a target.
+    // Apply remat.
+    bUpdated = !subExpCandidates.empty();
+  }
+
+  return bUpdated;
+}
+
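+// Sum the destination register sizes of high-latency VMEM loads in MBB.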
+int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
+                  const SIRegisterInfo *SIRI, const MachineRegisterInfo &MRI) {
+  int vmemLdSize = 0;
+  // Collect vmem load size when split is enabled.
+  for (MachineInstr &MI : MBB) {
+    bool bIsHighLatency = SIII->isHighLatencyInstruction(MI);
+    if (!bIsHighLatency)
+      continue;
+    if (!(MI.mayLoad() &&
+          // Skip cases like atomics which do not return a value.
+          MI.getNumDefs() > 0))
+      continue;
+    // a vmem ld.
+    MachineOperand &Dst = MI.getOperand(0);
+    LaneBitmask mask = llvm::getRegMask(Dst, MRI);
+    unsigned size = llvm::getRegSize(Dst.getReg(), mask, MRI, SIRI);
+    vmemLdSize += size;
+  }
+  return vmemLdSize;
+}
+
+} // namespace
+
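+// Top level driver of the sub expression remat: collect hot blocks and their
+// pass thru live regs, optionally turn uniform VGPR work into SGPRs, then
+// run per-block pass thru remat.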
+bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
+                LiveIntervals *LIS, MachineDominatorTree *pDT,
+                MachinePostDominatorTree *pPDT, AliasAnalysis *AA) {
+  if (MF.size() < 2)
+    return false;
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+
+  const SIInstrInfo *SIII = ST->getInstrInfo();
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+
+  auto &MRI = MF.getRegInfo();
+
+  RematStatus status = GetRematStatus(MF, MLI, LIS, MRI, ST);
+
+  const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
+  if (status.TargetOcc >= MaxOcc)
+    return false;
+
+  unsigned VLimit = status.TargetVLimit;
+  unsigned SLimit = status.TargetSLimit;
+
+  int rematVCnt = status.MaxVPressure - VLimit;
+  int rematSCnt = status.MaxSPressure - SLimit;
+
+  bool bSGPRSpill = false;
+  if (rematSCnt > 0) {
+    bSGPRSpill = nearSgprSpill(status.MaxSPressure, ST, MF);
+  }
+
+  // If bound by lds, skip.
+  if ((status.TargetOcc + 1) > ST->getOccupancyWithLocalMemSize(MF) &&
+      !bSGPRSpill)
+    return false;
+
+  bool bBothOutLimit = rematVCnt > 0 && rematSCnt > 0;
+  // TODO: check WQM and support vreg remat.
+  bool bCheckWQM = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+  rematVCnt = bCheckWQM & false;
+
+  // Remat on every hot block.
+
+  // Collect all hot blocks.
+  std::vector<HotBlock> hotBlocks;
+  for (MachineBasicBlock &MBB : MF) {
+    // Collect reg pressure.
+    auto &RP = status.MBBPressureMap[&MBB];
+    unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+    unsigned maxLocalSPressure = RP.getMaxSGPR();
+
+    maxLocalSPressure += RegForVCC;
+
+    if (!EnableInBlockRemat) {
+      if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit)
+        continue;
+    }
+
+    // Move insts whose inputs are imm/pass thru regs/out regs to help
+    // pressure.
+    if (tryHoldPacifist(MBB, LIS, MRI, SIRI, SIII, AA, status)) {
+      maxLocalVPressure = 0;
+      maxLocalSPressure = 0;
+      CollectMBBPressure(MBB, LIS, MRI, ST, maxLocalVPressure,
+                         maxLocalSPressure, status);
+
+      maxLocalSPressure += RegForVCC;
+
+    }
+    if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit)
+      continue;
+
+    // When both vgpr sgpr out limit, only help vgpr.
+    if (bBothOutLimit && maxLocalVPressure <= VLimit)
+      continue;
+    GCNRPTracker::LiveRegSet liveSet;
+    hotBlocks.push_back(
+        {&MBB, liveSet, std::make_pair(maxLocalVPressure, maxLocalSPressure),
+         0, 0});
+  }
+  // Collect vmemLdInput/OutputSize.
+  if (EnableVmemDegree) {
+    DenseMap<MachineBasicBlock *, unsigned> outputVMemLdSizeMap;
+    for (auto it : hotBlocks) {
+      MachineBasicBlock *MBB = it.MBB;
+      // Collect vmem load size when split is enabled.
+      int vmemLdSize = getVMemLdSize(*MBB, SIII, SIRI, MRI);
+      if (vmemLdSize) {
+        outputVMemLdSizeMap[MBB] = vmemLdSize;
+      }
+    }
+    for (auto &it : hotBlocks) {
+      MachineBasicBlock *MBB = it.MBB;
+
+      auto oit = outputVMemLdSizeMap.find(MBB);
+      if (oit != outputVMemLdSizeMap.end())
+        it.vmemLdOutputSize = oit->second;
+
+      if (MBB->pred_size() != 1)
+        continue;
+
+      MachineBasicBlock *Pred = *MBB->pred_begin();
+      oit = outputVMemLdSizeMap.find(Pred);
+      if (oit != outputVMemLdSizeMap.end()) {
+        it.vmemLdInputSize = oit->second;
+      } else {
+        if (Pred->getFirstTerminator() != Pred->end())
+          continue;
+        if (Pred->empty())
+          continue;
+        bool bIsHighLatency = SIII->isHighLatencyInstruction(Pred->back());
+        if (!bIsHighLatency)
+          continue;
+        int vmemLdSize = getVMemLdSize(*Pred, SIII, SIRI, MRI);
+        it.vmemLdInputSize = vmemLdSize;
+      }
+    }
+  }
+
+  if (EnableUniformVectorToScalar) {
+    if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, hotBlocks, LIS, MRI,
+                               SIRI, SIII, MLI)) {
+      // Rebuild LIS.
+      LIS->reanalyze(MF);
+      status = GetRematStatus(MF, MLI, LIS, MRI, ST);
+      bool bSgprSpilled = nearSgprSpill(status.MaxSPressure, ST, MF);
+      if (bSgprSpilled) {
+        bool bNearTarget = false;
+        hotBlockRemat(Remat, MF, MLI, LIS, pDT, pPDT, bNearTarget);
+        // Rebuild LIS.
+        LIS->reanalyze(MF);
+        status = GetRematStatus(MF, MLI, LIS, MRI, ST);
+      }
+
+      for (auto &it : hotBlocks) {
+        MachineBasicBlock *MBB = it.MBB;
+
+        // Update pressure.
+        auto &RP = status.MBBPressureMap[MBB];
+        unsigned maxLocalVPressure = RP.getVGPRNum(ST->hasGFX90AInsts());
+        unsigned maxLocalSPressure = RP.getMaxSGPR();
+
+        maxLocalSPressure += RegForVCC;
+        it.maxPressures.first = maxLocalVPressure;
+        it.maxPressures.second = maxLocalSPressure;
+      }
+    }
+  }
+
+  // Collect all live reg which cross hot blocks.
+  GCNRPTracker::LiveRegSet liveRegCandidates;
+  for (auto it : hotBlocks) {
+    MachineBasicBlock *MBB = it.MBB;
+
+    const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[MBB];
+
+    const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[MBB];
+
+    llvm::mergeLiveRegSet(liveRegCandidates, inputLive);
+    llvm::mergeLiveRegSet(liveRegCandidates, outputLive);
+  }
+
+  // Check min VGPR bound.
+  BlockSet PressureUnderLimitSet;
+  if (EnableSubExpMinReg) {
+    for (auto &it : hotBlocks) {
+      MachineBasicBlock *MBB = it.MBB;
+      unsigned MaxLocalVGPR = 0;
+      unsigned MaxLocalSGPR = 0;
+      llvm::getRegBound(MBB, MRI, SIRI, SIII, LIS, MaxLocalVGPR, MaxLocalSGPR);
+
+      if (MaxLocalVGPR < VLimit && MaxLocalSGPR < SLimit) {
+        PressureUnderLimitSet.insert(MBB);
+      } else {
+        if (MaxLocalVGPR < it.maxPressures.first)
+          it.maxPressures = std::make_pair(MaxLocalVGPR, it.maxPressures.second);
+        if (MaxLocalSGPR < it.maxPressures.second)
+          it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR);
+      }
+    }
+  }
+
+  bool bUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates,
+                                        ST, LIS, MLI, pDT, MRI, SIRI, SIII);
+
+  return bUpdated;
+}
+
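+// Entry point: mark uniform insts via the MIR divergence analysis, run the
+// per-hot-block remat, then the sub expression group remat when enabled.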
+bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.size() < 2)
+    return false;
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  MachineDominatorTree *DT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  MachinePostDominatorTree *PDT =
+      &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  {
+    llvm::MirGPUDivergenceAnalysis DA(MF, *DT, *PDT, *MLI);
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        if (DA.isUniform(&MI)) {
+          TotalUniformInsts.insert(&MI);
+        }
+      }
+    }
+  }
+
+  // For non-cs/ps, set target occ as 4.
+  bool bNearTarget = false;
+  bool bFinalUpdated = false;
+  bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget);
+  bFinalUpdated |= bUpdated;
+  if (EnableSubExp) {
+    if (bUpdated) {
+      // Rebuild LIS.
+      LIS->reanalyze(MF);
+    }
+
+    bUpdated = GroupRemat(this, MF, MLI, LIS, DT, PDT, AA);
+
+    bFinalUpdated |= bUpdated;
+  }
+  return bFinalUpdated;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+                      "AMDGPU rematerialize", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, "AMDGPU rematerialize",
+                    false, false)
+
+char AMDGPUHotBlockRematerialize::ID = 0;
+char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
+
+FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
+  return new AMDGPUHotBlockRematerialize();
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
new file mode 100644
index 000000000000000..6f44fec08239cde
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -0,0 +1,2241 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// AMDGPUMIRUtils.cpp                                                          //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Util functions for llvm MIR Passes.                                       //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+
+//#include "dxc/DXIL/DxilMetadataHelper.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/Support/GraphWriter.h"
+
+#include "llvm/Support/Debug.h"
+
+#include "GCNRegPressure.h"
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUSubExpDag.h"
+#include <unordered_set>
+
+#define DEBUG_TYPE "xb-mir-util"
+using namespace llvm;
+namespace {
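+// Pairs each basic block with the PHI-related instructions (the PHIs, their
+// incoming defs, and their users) that touch it, for DOT graph dumps.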
+class CFGWithPhi {
+public:
+  CFGWithPhi(MachineFunction &F) : F(F) {
+    // Collect phi and phi related insts.
+    MachineRegisterInfo &MRI = F.getRegInfo();
+
+    for (MachineBasicBlock &BB : F) {
+      auto &phiInsts = blockToPhiInstsMap[&BB];
+      for (MachineInstr &I : BB) {
+        if (!I.isPHI())
+          break;
+        phiInsts.insert(&I);
+        unsigned Reg = I.getOperand(0).getReg();
+        // Add incoming values.
+        for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
+          MachineOperand &MO = I.getOperand(i);
+          if (!MO.isReg())
+            continue;
+          MachineInstr *DefMI = MRI.getUniqueVRegDef(MO.getReg());
+          if (!DefMI)
+            continue;
+          blockToPhiInstsMap[DefMI->getParent()].insert(DefMI);
+        }
+        // Add users.
+        for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+          blockToPhiInstsMap[UseMI.getParent()].insert(&UseMI);
+        }
+      }
+    }
+  }
+
+  /// Adds custom features for a visualization of the ScheduleDAG.
+  void addCustomGraphFeatures(llvm::GraphWriter<CFGWithPhi *> &) const {}
+  MachineFunction &F;
+  DenseMap<const MachineBasicBlock *, DenseSet<MachineInstr *>> blockToPhiInstsMap;
+  void dump();
+};
+
+void CFGWithPhi::dump() {
+#ifdef DBG
+  for (MachineBasicBlock &BB : F) {
+    dbgs() << BB.getName() << "\n";
+    auto &phiInsts = blockToPhiInstsMap[&BB];
+    for (MachineInstr *I : phiInsts) {
+      if (!I->isPHI())
+        continue;
+      I->dump();
+    }
+    for (MachineInstr *I : phiInsts) {
+      if (I->isPHI())
+        continue;
+      I->dump();
+    }
+  }
+#endif
+}
+
+} // namespace
+
+// CFGWithPhi dump.
+namespace llvm {
+
+template <> struct DOTGraphTraits<CFGWithPhi *> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(const CFGWithPhi *G) {
+    return "CFG with Phi graph";
+  }
+
+  static std::string getNodeIdentifierLabel(const MachineBasicBlock *Node,
+                                            const CFGWithPhi *Graph) {
+    std::string R;
+    raw_string_ostream OS(R);
+    OS << static_cast<const void *>(Node);
+    return R;
+  }
+
+  static std::string getNodeLabel(const MachineBasicBlock *BB, const CFGWithPhi *G) {
+    enum { MaxColumns = 8000 };
+    std::string Str;
+    raw_string_ostream OS(Str);
+
+    OS << "BB:" << BB->getName();
+    auto it = G->blockToPhiInstsMap.find(BB);
+    if (it != G->blockToPhiInstsMap.end()) {
+
+      auto &phiInsts = it->second;
+      for (MachineInstr *I : phiInsts) {
+        if (!I->isPHI())
+          continue;
+        I->print(OS);
+        OS << "\n";
+      }
+      for (MachineInstr *I : phiInsts) {
+        if (I->isPHI())
+          continue;
+        I->print(OS);
+        OS << "\n";
+      }
+    }
+    std::string OutStr = OS.str();
+    if (OutStr[0] == '\n')
+      OutStr.erase(OutStr.begin());
+
+    // Process string output to make it nicer...
+    unsigned ColNum = 0;
+    unsigned LastSpace = 0;
+    for (unsigned i = 0; i != OutStr.length(); ++i) {
+      if (OutStr[i] == '\n') { // Left justify
+        OutStr[i] = '\\';
+        OutStr.insert(OutStr.begin() + i + 1, 'l');
+        ColNum = 0;
+        LastSpace = 0;
+      } else if (OutStr[i] == ';') {             // Delete comments!
+        unsigned Idx = OutStr.find('\n', i + 1); // Find end of line
+        OutStr.erase(OutStr.begin() + i, OutStr.begin() + Idx);
+        --i;
+      } else if (ColNum == MaxColumns) { // Wrap lines.
+        // Wrap very long names even though we can't find a space.
+        if (!LastSpace)
+          LastSpace = i;
+        OutStr.insert(LastSpace, "\\l...");
+        ColNum = i - LastSpace;
+        LastSpace = 0;
+        i += 3; // The loop will advance 'i' again.
+      } else
+        ++ColNum;
+      if (OutStr[i] == ' ')
+        LastSpace = i;
+    }
+    return OutStr;
+  }
+  static std::string getNodeDescription(const MachineBasicBlock *SU,
+                                        const CFGWithPhi *G) {
+    return SU->getName().str();
+  }
+
+  static void addCustomGraphFeatures(CFGWithPhi *G,
+                                     GraphWriter<CFGWithPhi *> &GW) {
+    return G->addCustomGraphFeatures(GW);
+  }
+};
+
+template <> struct GraphTraits<CFGWithPhi *> {
+  using NodeRef = MachineBasicBlock *;
+  using ChildIteratorType = MachineBasicBlock::succ_iterator;
+  using nodes_iterator = pointer_iterator<MachineFunction::iterator>;
+
+  // static NodeRef getEntryNode(const CFGWithPhi *G) {
+  //  return G->F.getFunctionEntry();
+  //}
+
+  static ChildIteratorType child_begin(const NodeRef N) {
+    return N->succ_begin();
+  }
+
+  static ChildIteratorType child_end(const NodeRef N) { return N->succ_end(); }
+
+  static nodes_iterator nodes_begin(const CFGWithPhi *G) {
+    return nodes_iterator(G->F.begin());
+  }
+
+  static nodes_iterator nodes_end(const CFGWithPhi *G) {
+    return nodes_iterator(G->F.end());
+  }
+};
+
+} // namespace llvm
+
+namespace llvm {
+
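+// Return the size of Reg in 32-bit lanes (register size in bits >> 5). When
+// Mask covers fewer lanes than the full register class, return the lane count
+// of Mask instead.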
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI) {
+  unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg));
+  Size >>= 5;
+  LaneBitmask mask = Mask;
+  if (mask.any()) {
+    if (unsigned maskSize = mask.getNumLanes()) {
+      if (maskSize < Size)
+        Size = maskSize;
+    }
+  }
+  return Size;
+}
+
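+// Sum the VGPR and SGPR pressure (in 32-bit lanes) contributed by every
+// register in liveSet into VPressure and SPressure.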
+void CollectLiveSetPressure(const LiveSet &liveSet,
+                            const MachineRegisterInfo &MRI,
+                            const SIRegisterInfo *SIRI, unsigned &VPressure,
+                            unsigned &SPressure) {
+  VPressure = 0;
+  SPressure = 0;
+  for (auto liveIt : liveSet) {
+    unsigned Reg = liveIt.first;
+    unsigned Size = getRegSize(Reg, liveIt.second, MRI, SIRI);
+    if (SIRI->isVGPR(MRI, Reg)) {
+      VPressure += Size;
+    } else {
+      SPressure += Size;
+    }
+  }
+}
+
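+// Return true when MI is one of the exec-mask updates used for structured
+// control flow, i.e. an S_MOV/S_OR*/S_OR_SAVEEXEC/S_AND/S_ANDN2 whose
+// destination is exec or exec_lo.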
+bool isExecUpdateForControlFlow(llvm::MachineInstr &MI) {
+  bool isExecUpdate = false;
+  unsigned opcode = MI.getOpcode();
+  if (opcode == AMDGPU::S_MOV_B64 || opcode == AMDGPU::S_MOV_B32 ||
+      opcode == AMDGPU::S_OR_B64_term || opcode == AMDGPU::S_OR_B32_term ||
+      opcode == AMDGPU::S_OR_SAVEEXEC_B64 ||
+      opcode == AMDGPU::S_OR_SAVEEXEC_B32 || opcode == AMDGPU::S_AND_B64 ||
+      opcode == AMDGPU::S_AND_B32 || opcode == AMDGPU::S_ANDN2_B64 ||
+      opcode == AMDGPU::S_ANDN2_B32) {
+    MachineOperand &Dst = MI.getOperand(0);
+    if (Dst.getReg() == AMDGPU::EXEC || Dst.getReg() == AMDGPU::EXEC_LO) {
+      isExecUpdate = true;
+    }
+  }
+  return isExecUpdate;
+}
+
+bool IsSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+  // Support multi def for pattern of pointer:
+  // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32
+  // %808.sub1:sgpr_64 = S_MOV_B32 0
+  bool bHasSub0 = false;
+  bool bHasSub1 = false;
+  for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) {
+    if (unsigned SubReg = UserDefMO.getSubReg()) {
+      bool bSingleSubReg = false;
+      switch (SubReg) {
+      default:
+        break;
+      case AMDGPU::sub0:
+        if (!bHasSub0) {
+          bHasSub0 = true;
+          bSingleSubReg = true;
+        }
+        break;
+      case AMDGPU::sub1:
+        if (!bHasSub1) {
+          bHasSub1 = true;
+          bSingleSubReg = true;
+        }
+        break;
+      }
+      if (!bSingleSubReg) {
+        bHasSub0 = false;
+        break;
+      }
+    } else {
+      bHasSub0 = false;
+      break;
+    }
+  }
+
+  return (bHasSub0 && bHasSub1);
+}
+
+LaneBitmask getRegMask(const MachineOperand &MO,
+                       const MachineRegisterInfo &MRI) {
+  // We don't rely on read-undef flag because in case of tentative schedule
+  // tracking it isn't set correctly yet. This works correctly however since
+  // use mask has been tracked before using LIS.
+  return MO.getSubReg() == 0
+             ? MRI.getMaxLaneMaskForVReg(MO.getReg())
+             : MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(
+                   MO.getSubReg());
+}
+
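+// targetSet |= inputSet: lane masks are OR'ed per register.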
+void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) {
+  for (auto Reg : inputSet) {
+    unsigned reg = Reg.first;
+    LaneBitmask mask = Reg.second;
+    auto targetReg = targetSet.find(reg);
+    if (targetReg != targetSet.end()) {
+      LaneBitmask targetMask = targetReg->second;
+      mask |= targetMask;
+    }
+    targetSet[reg] = mask;
+  }
+}
+
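+// targetSet &= inputSet: only registers present in both sets survive and
+// their lane masks are AND'ed.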
+void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) {
+  GCNRPTracker::LiveRegSet AndSet;
+  for (auto Reg : inputSet) {
+    unsigned reg = Reg.first;
+    LaneBitmask mask = Reg.second;
+    auto targetReg = targetSet.find(reg);
+    if (targetReg != targetSet.end()) {
+      LaneBitmask targetMask = targetReg->second;
+      mask &= targetMask;
+      AndSet[reg] = mask;
+    }
+  }
+
+  targetSet = AndSet;
+}
+
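+// targetSet &= ~inputSet: registers whose lanes are fully covered by inputSet
+// are removed, otherwise only the covered lanes are cleared.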
+void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) {
+  for (auto Reg : inputSet) {
+    unsigned reg = Reg.first;
+    LaneBitmask mask = Reg.second;
+    auto targetReg = targetSet.find(reg);
+    if (targetReg != targetSet.end()) {
+      LaneBitmask targetMask = targetReg->second;
+      if ((targetMask | mask) == mask)
+        targetSet.erase(reg);
+      else
+        targetSet[reg] = targetMask & (~mask);
+    }
+  }
+}
+
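+// Split MBB after Inst: create a new fall-through block, transfer the
+// successors to it and move the trailing instructions over.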
+MachineBasicBlock *split(MachineInstr *Inst) {
+  // Create the fall-through block.
+  MachineBasicBlock *MBB = Inst->getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock();
+  auto MBBIter = ++(MBB->getIterator());
+  MF->insert(MBBIter, SuccMBB);
+  SuccMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  MBB->addSuccessor(SuccMBB);
+
+  // Splice the code over.
+  SuccMBB->splice(SuccMBB->end(), MBB, ++Inst->getIterator(), MBB->end());
+
+  return SuccMBB;
+}
+
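+// A contiguous run of set lanes in a 32-bit mask: the owning Reg, the lane
+// offset of the first set bit and the run length. Piece::split() breaks a
+// mask into such runs, each capped at 8 lanes, e.g. 0b000011110011 becomes
+// {offset 0, size 2} and {offset 4, size 4}.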
+struct Piece {
+  unsigned Reg;
+  unsigned offset;
+  unsigned size;
+  static SmallVector<Piece, 8> split(std::bitset<32> mask) {
+
+    SmallVector<Piece, 8> pieces;
+    Piece piece = {0, 0, 0};
+    for (unsigned i = 0; i < 32; i++) {
+      if (mask.test(i)) {
+        if (piece.size == 0)
+          piece.offset = i;
+
+        piece.size++;
+        // Make sure no piece is bigger than 8 lanes.
+        if (piece.size == 8) {
+          pieces.emplace_back(piece);
+          piece.size = 0;
+        }
+      } else {
+        if (piece.size == 0) {
+          continue;
+        }
+        pieces.emplace_back(piece);
+        piece.size = 0;
+      }
+    }
+    return pieces;
+  }
+};
+
+void updateSubReg(MachineOperand &UseMO, const llvm::TargetRegisterClass *NewRC,
+                  unsigned offset, const SIRegisterInfo *SIRI,
+                  const SIInstrInfo *SIII) {
+  unsigned size = NewRC->getLaneMask().getNumLanes();
+  if (size == 1) {
+    UseMO.setSubReg(0);
+  } else {
+    const uint32_t SubReg = UseMO.getSubReg();
+    LaneBitmask Mask = SIRI->getSubRegIndexLaneMask(SubReg);
+
+    unsigned mask = Mask.getAsInteger() >> offset;
+
+    unsigned NewSubReg = SIRI->getMinimalSpanningSubRegIdxSetForLaneMask(
+                                 NewRC, LaneBitmask(mask))
+                             .front();
+
+    UseMO.setSubReg(NewSubReg);
+  }
+}
+
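+// Rewrite a scalar buffer load so it defines the narrower register class of
+// desc starting at lane 'offset': the (s)offset operand is advanced by
+// offset dwords and the sub-register indices of all users are remapped.
+// Returns false when the rewrite is not legal.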
+bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc,
+                   MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                   const SIInstrInfo *SIII, SlotIndexes *SlotIndexes) {
+  MachineOperand &DstMO = MI.getOperand(0);
+  // Skip case when dst subReg not 0.
+  if (DstMO.getSubReg()) {
+    return false;
+  }
+  unsigned Reg = DstMO.getReg();
+
+  SmallVector<MachineOperand *, 2> UseMOs;
+  for (MachineOperand &UseMO : MRI.use_nodbg_operands(Reg)) {
+    UseMOs.emplace_back(&UseMO);
+  }
+
+  const llvm::TargetRegisterClass *NewRC =
+      SIRI->getRegClass(desc.operands().front().RegClass);
+  unsigned size = NewRC->getLaneMask().getNumLanes();
+  if (offset > 0) {
+    // Update offset operand in MI.
+    MachineOperand *OffsetOp =
+        SIII->getNamedOperand(MI, AMDGPU::OpName::offset);
+
+    const uint32_t LaneSize = sizeof(uint32_t);
+    if (OffsetOp) {
+      if (OffsetOp->isImm()) {
+        int64_t Offset = OffsetOp->getImm();
+        Offset += offset * LaneSize;
+        if (!SIII->isLegalMUBUFImmOffset(Offset)) {
+          return false;
+        }
+        OffsetOp->setImm(Offset);
+      } else {
+        return false;
+      }
+    } else {
+      OffsetOp = SIII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+      if (OffsetOp) {
+        unsigned NewOffsetReg =
+            MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+        auto OffsetAdd = BuildMI(*MI.getParent()->getParent(), MI.getDebugLoc(),
+                                 SIII->get(AMDGPU::S_ADD_U32))
+                             .addDef(NewOffsetReg)
+                             .add(*OffsetOp)
+                             .addImm(offset * LaneSize);
+        MachineInstr *OffsetAddMI = OffsetAdd.getInstr();
+        MachineBasicBlock::iterator InsertPoint =
+            llvm::FindOrCreateInsertionPointForSccDef(
+                MI.getParent(), MI, SIRI, SIII, &MRI
+            );
+        MI.getParent()->insert(InsertPoint, OffsetAddMI);
+        SIII->legalizeOperands(*OffsetAddMI);
+        OffsetOp->setReg(NewOffsetReg);
+        OffsetOp->setSubReg(0);
+        if (SlotIndexes)
+          SlotIndexes->insertMachineInstrInMaps(*OffsetAddMI);
+      } else {
+        return false;
+      }
+    }
+    // Update subReg for users.
+    for (MachineOperand *UseMO : UseMOs) {
+      updateSubReg(*UseMO, NewRC, offset, SIRI, SIII);
+    }
+  } else if (size == 1) {
+    // Clear subReg when size is 1.
+    for (MachineOperand *UseMO : UseMOs) {
+      UseMO->setSubReg(0);
+    }
+  }
+
+  MI.setDesc(desc);
+  // Mutate reg class of Reg.
+  MRI.setRegClass(Reg, NewRC);
+  return true;
+}
+
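+// For wide S_BUFFER_LOAD results where only a contiguous subset of lanes is
+// actually used, shrink the load to the narrowest 1/2/4/8-dword variant that
+// still covers the used lanes.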
+bool removeUnusedLanes(llvm::MachineInstr &MI, MachineRegisterInfo &MRI,
+                       const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+                       SlotIndexes *SlotIndexes) {
+  bool bImm = false;
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM:
+    bImm = true;
+    [[fallthrough]];
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+    unsigned Reg = MI.getOperand(0).getReg();
+    if (!MRI.getUniqueVRegDef(Reg))
+      return false;
+    LaneBitmask dstMask = getRegMask(MI.getOperand(0), MRI);
+    LaneBitmask UseMask;
+    for (MachineOperand &MO : MRI.use_operands(Reg)) {
+      UseMask |= llvm::getRegMask(MO, MRI);
+    }
+
+    const unsigned fullMask = dstMask.getAsInteger();
+    unsigned mask = UseMask.getAsInteger();
+    if (mask == fullMask)
+      return false;
+    // Split the mask where there are gaps, then group each piece into 2/4/8
+    // lanes.
+    auto pieces = Piece::split(std::bitset<32>(mask));
+    // Only one piece is supported for now.
+    if (pieces.size() != 1)
+      return false;
+    auto piece = pieces[0];
+    if (piece.size > 8)
+      return false;
+
+    // TODO: enable offset support when bImm is true.
+    // Right now different tests break depending on whether or not the offset
+    // is multiplied by LaneSize.
+    if (bImm && piece.offset != 0)
+      return false;
+
+    switch (piece.size) {
+    default:
+      return false;
+    case 1:
+      return reduceChannel(piece.offset, MI,
+                           SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORD_IMM
+                                          : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR),
+                           MRI, SIRI, SIII, SlotIndexes);
+    case 2:
+      return reduceChannel(piece.offset, MI,
+                           SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
+                                          : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR),
+                           MRI, SIRI, SIII, SlotIndexes);
+    case 3:
+      if (fullMask == 0xf)
+        return false;
+      [[fallthrough]];
+    case 4:
+      return reduceChannel(piece.offset, MI,
+                           SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
+                                          : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR),
+                           MRI, SIRI, SIII, SlotIndexes);
+    case 5:
+    case 6:
+    case 7:
+      if (fullMask == 0xff)
+        return false;
+      [[fallthrough]];
+    case 8:
+      return reduceChannel(piece.offset, MI,
+                           SIII->get(bImm ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM
+                                          : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR),
+                           MRI, SIRI, SIII, SlotIndexes);
+    }
+
+  } break;
+  }
+  return false;
+}
+
+// LoopInfo contains a mapping from basic block to the innermost loop. Find
+// the outermost loop in the loop nest that contains BB.
+const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI,
+                                    const MachineBasicBlock *BB) {
+  const MachineLoop *L = LI->getLoopFor(BB);
+  if (L) {
+    while (const MachineLoop *Parent = L->getParentLoop())
+      L = Parent;
+  }
+  return L;
+}
+
+// True if there is a loop which contains both BB1 and BB2.
+bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1,
+                      const MachineBasicBlock *BB2) {
+  const MachineLoop *L1 = getOutermostLoop(LI, BB1);
+  const MachineLoop *L2 = getOutermostLoop(LI, BB2);
+  return L1 != nullptr && L1 == L2;
+}
+
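+// Conservative reachability test: FromBB is considered able to reach ToBB
+// when the blocks are the same, FromBB dominates ToBB, ToBB post-dominates
+// FromBB, or some loop contains both.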
+bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT,
+                 MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
+                 MachineBasicBlock *ToBB) {
+  if (FromBB == ToBB) {
+    return true;
+  }
+
+  if (DT->dominates(FromBB, ToBB)) {
+    return true;
+  }
+
+  if (PDT->dominates(ToBB, FromBB)) {
+    return true;
+  }
+
+  if (loopContainsBoth(LI, ToBB, FromBB)) {
+    return true;
+  }
+  // TODO: cover the case where the hot block is in a loop and either a block
+  //       in that loop dominates BB, or BB post-dominates a block in that
+  //       loop.
+  return false;
+}
+
+// If BB can reach hotMBBs.
+bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT,
+                  MachinePostDominatorTree *PDT, MachineLoopInfo *LI,
+                  DenseSet<MachineBasicBlock *> &hotMBBs) {
+  bool bCross = false;
+  for (MachineBasicBlock *hotBB : hotMBBs) {
+    if (reach_block(BB, DT, PDT, LI, hotBB)) {
+      bCross = true;
+      break;
+    }
+  }
+  return bCross;
+}
+
+} // namespace llvm
+
+namespace llvm {
+void viewCFGWithPhi(llvm::MachineFunction &F) {
+#ifdef DBG
+  CFGWithPhi G(F);
+  ViewGraph(const_cast<CFGWithPhi *>(&G), F.getName(), false, F.getName());
+  G.dump();
+#endif
+}
+} // namespace llvm
+
+namespace llvm {
+bool GetNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd,
+                       MachineBasicBlock &MBB) {
+  // R.End doesn't point to the boundary instruction.
+  // Skip Debug instr.
+  while (BBEnd != MBB.rend() && BBEnd->isDebugInstr())
+    BBEnd++;
+  return BBEnd != MBB.rend();
+}
+} // namespace llvm
+
+// Helper functions to write JSON.
+namespace {
+void json_name(StringRef Val, raw_ostream &os) { os << "\"" << Val << "\":"; }
+
+template <typename write_fn>
+void json_pair(StringRef Val, write_fn &fn, raw_ostream &os) {
+  json_name(Val, os);
+  os << "\"";
+  fn();
+  os << "\"";
+}
+
+template <typename write_fn>
+void json_obj_pair(StringRef Val, write_fn &fn, raw_ostream &os) {
+  json_name(Val, os);
+
+  fn();
+}
+
+template <typename write_fn>
+void json_array(StringRef Val, write_fn &fn, raw_ostream &os) {
+  json_name(Val, os);
+  os << "[";
+  fn();
+  os << "]";
+}
+} // namespace
+
+namespace llvm {
+namespace pressure {
+
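+// The write_* helpers below serialize the function (blocks, instructions,
+// defs, uses, liveness and live intervals) as JSON so register pressure can
+// be inspected offline.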
+void write_inst(MachineInstr &MI, const SlotIndexes *SlotIndexes,
+                const SIInstrInfo *SIII, raw_ostream &os) {
+  os << "{";
+  SlotIndex Slot = SlotIndexes->getInstructionIndex(MI);
+  auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+  json_pair("slot_index", writeSlot, os);
+
+  os << ",";
+
+  auto writeOpcode = [&MI, &SIII, &os]() {
+    os << SIII->getName(MI.getOpcode());
+  };
+
+  json_pair("opcode", writeOpcode, os);
+
+  os << ",";
+
+  auto writeAsm = [&MI, &SIII, &os]() {
+    MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false,
+             /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII);
+  };
+  json_pair("asm", writeAsm, os);
+
+  os << "}";
+}
+
+void print_reg(Register Reg, const MachineRegisterInfo &MRI,
+               const SIRegisterInfo *SIRI, raw_ostream &os) {
+  if (Reg.isVirtual()) {
+    StringRef Name = MRI.getVRegName(Reg);
+    if (Name != "") {
+      os << '%' << Name;
+    } else {
+      os << '%' << Register::virtReg2Index(Reg);
+    }
+  } else if (Reg < SIRI->getNumRegs()) {
+    os << '$';
+    printLowerCase(SIRI->getName(Reg), os);
+  } else {
+    llvm_unreachable("invalid reg");
+  }
+}
+
+void write_reg(unsigned Reg, unsigned SubReg, const MachineRegisterInfo &MRI,
+               const SIRegisterInfo *SIRI, raw_ostream &os) {
+  os << "{";
+
+  auto writeReg = [&MRI, &SIRI, &Reg, &os]() { print_reg(Reg, MRI, SIRI, os); };
+  json_pair("reg", writeReg, os);
+
+  os << ",";
+
+  auto writeSubReg = [&SubReg, &os]() { os << SubReg; };
+
+  json_pair("sub_reg", writeSubReg, os);
+
+  os << ",";
+  auto writeIsSgpr = [&Reg, &MRI, &SIRI, &os]() {
+    if (SIRI->isSGPRReg(MRI, Reg))
+      os << "true";
+    else
+      os << "false";
+  };
+  json_obj_pair("is_sgpr", writeIsSgpr, os);
+  os << "}";
+}
+
+unsigned get_reg_size(unsigned Reg, const MachineRegisterInfo &MRI,
+                      const SIRegisterInfo *SIRI) {
+  return SIRI->getRegClassForReg(MRI, Reg)->getLaneMask().getNumLanes();
+}
+
+void write_live(unsigned Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI,
+                const SIRegisterInfo *SIRI, raw_ostream &os) {
+  if (Mask.none()) {
+    unsigned size = get_reg_size(Reg, MRI, SIRI);
+    Mask = LaneBitmask((1 << size) - 1);
+  }
+  unsigned mask = Mask.getAsInteger();
+  for (unsigned i = 0; i <= Mask.getHighestLane(); i++) {
+    if (mask & (1 << i)) {
+      write_reg(Reg, i, MRI, SIRI, os);
+      os << ",\n";
+    }
+  }
+}
+
+void write_dag_input_node(unsigned ID, unsigned reg, unsigned mask,
+                          const MachineRegisterInfo &MRI,
+                          const SIRegisterInfo *SIRI, raw_ostream &os) {
+  os << "{";
+  auto writeID = [&ID, &os]() { os << ID; };
+
+  json_pair("ID", writeID, os);
+
+  os << ",";
+
+  auto writeReg = [&reg, &MRI, &SIRI, &os]() { print_reg(reg, MRI, SIRI, os); };
+
+  json_pair("reg", writeReg, os);
+
+  os << ",";
+
+  auto writeMask = [&mask, &os]() { os << mask; };
+
+  json_pair("mask", writeMask, os);
+
+  os << "},\n";
+}
+
+void write_dag_inst_node(unsigned ID, SlotIndex Slot,
+                         GCNRPTracker::LiveRegSet LiveReg,
+                         const MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI, SUnit *SU,
+                         raw_ostream &os) {
+  os << "{";
+  auto writeID = [&ID, &os]() { os << ID; };
+
+  json_pair("ID", writeID, os);
+
+  os << ",";
+
+  auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+  json_pair("slot_index", writeSlot, os);
+
+  os << ",";
+
+  auto writeRegs = [&LiveReg, &MRI, &SIRI, &os]() {
+    for (auto it : LiveReg) {
+      unsigned Reg = it.first;
+      LaneBitmask Mask = it.second;
+      write_live(Reg, Mask, MRI, SIRI, os);
+    }
+  };
+  json_array("regs", writeRegs, os);
+
+  os << ",";
+
+  auto writePreds = [&SU, &os]() {
+    for (auto &Pred : SU->Preds) {
+
+      os << Pred.getSUnit()->NodeNum << ",";
+    }
+  };
+
+  json_array("preds", writePreds, os);
+
+  os << "},\n";
+}
+
+void write_block(MachineBasicBlock &Blk, LiveIntervals *LIS,
+                 const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                 const SIInstrInfo *SIII, raw_ostream &os) {
+  os << "{\n";
+  auto writeName = [&Blk, &os]() { os << Blk.getName(); };
+  json_pair("name", writeName, os);
+
+  os << ",";
+
+  auto writeIndex = [&Blk, &os]() { os << Blk.getNumber(); };
+  json_pair("id", writeIndex, os);
+
+  os << ",";
+
+  const SlotIndexes *SlotIndexes = LIS->getSlotIndexes();
+
+  SlotIndex BeginSlot = SlotIndexes->getMBBStartIdx(&Blk);
+  auto writeSlot = [&BeginSlot, &os]() { BeginSlot.print(os); };
+  json_pair("begin_slot", writeSlot, os);
+
+  os << ",";
+
+  SlotIndex EndSlot = SlotIndexes->getMBBEndIdx(&Blk);
+  auto writeEndSlot = [&EndSlot, &os]() { EndSlot.print(os); };
+  json_pair("end_slot", writeEndSlot, os);
+
+  os << ",";
+
+  auto writeInsts = [&Blk, &SlotIndexes, &SIII, &os]() {
+    for (MachineInstr &MI : Blk) {
+      if (MI.isDebugInstr())
+        continue;
+      write_inst(MI, SlotIndexes, SIII, os);
+      os << ",\n";
+    }
+  };
+
+  json_array("instructions", writeInsts, os);
+
+  os << ",";
+
+  BlockExpDag dag(&Blk, LIS, MRI, SIRI, SIII);
+  dag.buildWithPressure();
+
+  const auto StartLiveReg = llvm::getLiveRegs(BeginSlot, *dag.LIS, dag.MRI);
+  auto writeInputs = [&StartLiveReg, &dag, &os]() {
+    for (auto it : StartLiveReg) {
+      unsigned Reg = it.first;
+      LaneBitmask mask = it.second;
+      SUnit *SU = dag.InputSUnitMap[Reg];
+      // Write Reg and mask to the nodes.
+      write_dag_input_node(SU->NodeNum, Reg, mask.getAsInteger(), dag.MRI,
+                           dag.SIRI, os);
+    }
+  };
+
+  json_array("input_nodes", writeInputs, os);
+
+  os << ",";
+
+  auto writeNodes = [&SlotIndexes, &dag, &os]() {
+    for (auto it : dag.MISUnitMap) {
+      MachineInstr *MI = it.first;
+      SUnit *SU = it.second;
+      // Use SlotIndex of MI.
+      SlotIndex SlotIndex;
+      if (!MI->isDebugInstr())
+        SlotIndex = SlotIndexes->getInstructionIndex(*MI);
+      GCNRPTracker::LiveRegSet LiveReg = dag.DagPressureMap[SU];
+      // Write slot, live to the nodes.
+      write_dag_inst_node(SU->NodeNum, SlotIndex, LiveReg, dag.MRI, dag.SIRI,
+                          SU, os);
+    }
+  };
+
+  json_array("inst_nodes", writeNodes, os);
+
+  os << ",";
+
+  auto writePreds = [&Blk, &os]() {
+    for (MachineBasicBlock *Pred : Blk.predecessors()) {
+      os << Pred->getNumber() << ",";
+    }
+  };
+
+  json_array("preds", writePreds, os);
+
+  os << ",";
+
+  auto writeSuccs = [&Blk, &os]() {
+    for (MachineBasicBlock *Succ : Blk.successors()) {
+      os << Succ->getNumber() << ",";
+    }
+  };
+
+  json_array("succs", writeSuccs, os);
+
+  os << "}";
+}
+
+void write_define(SlotIndex &Slot, unsigned Reg, unsigned SubReg,
+                  const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                  raw_ostream &os) {
+  os << "{";
+  auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+  json_pair("slot_index", writeSlot, os);
+
+  os << ",";
+
+  auto writeReg = [&MRI, &SIRI, &Reg, &SubReg, &os]() {
+    write_reg(Reg, SubReg, MRI, SIRI, os);
+  };
+  json_obj_pair("reg", writeReg, os);
+
+  os << "}\n";
+
+  os << ",";
+}
+
+void write_define(MachineOperand &MO, const SlotIndexes *SlotIndexes,
+                  const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                  raw_ostream &os) {
+  // Split subReg?  MO.getSubReg();
+  unsigned Reg = MO.getReg();
+  unsigned SubReg = MO.getSubReg();
+  MachineInstr *MI = MO.getParent();
+  SlotIndex Slot = SlotIndexes->getInstructionIndex(*MI);
+  if (SubReg == 0) {
+    unsigned size = get_reg_size(Reg, MRI, SIRI);
+    for (unsigned i = 0; i < size; i++) {
+      write_define(Slot, Reg, i, MRI, SIRI, os);
+    }
+  } else {
+    switch (SubReg) {
+    default:
+      assert(0 && "SubReg not supported yet.");
+      write_define(Slot, Reg, SubReg, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub0:
+      write_define(Slot, Reg, 0, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub1:
+      write_define(Slot, Reg, 1, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub2:
+      write_define(Slot, Reg, 2, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub3:
+      write_define(Slot, Reg, 3, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub4:
+      write_define(Slot, Reg, 4, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub5:
+      write_define(Slot, Reg, 5, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub6:
+      write_define(Slot, Reg, 6, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub7:
+      write_define(Slot, Reg, 7, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub8:
+      write_define(Slot, Reg, 8, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub9:
+      write_define(Slot, Reg, 9, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub10:
+      write_define(Slot, Reg, 10, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub11:
+      write_define(Slot, Reg, 11, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub12:
+      write_define(Slot, Reg, 12, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub13:
+      write_define(Slot, Reg, 13, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub14:
+      write_define(Slot, Reg, 14, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub15:
+      write_define(Slot, Reg, 15, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub0_sub1:
+      write_define(Slot, Reg, 0, MRI, SIRI, os);
+      write_define(Slot, Reg, 1, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub2_sub3:
+      write_define(Slot, Reg, 2, MRI, SIRI, os);
+      write_define(Slot, Reg, 3, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub4_sub5:
+      write_define(Slot, Reg, 4, MRI, SIRI, os);
+      write_define(Slot, Reg, 5, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub1_sub2:
+      write_define(Slot, Reg, 1, MRI, SIRI, os);
+      write_define(Slot, Reg, 2, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub0_sub1_sub2:
+      write_define(Slot, Reg, 0, MRI, SIRI, os);
+      write_define(Slot, Reg, 1, MRI, SIRI, os);
+      write_define(Slot, Reg, 2, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub0_sub1_sub2_sub3:
+      write_define(Slot, Reg, 0, MRI, SIRI, os);
+      write_define(Slot, Reg, 1, MRI, SIRI, os);
+      write_define(Slot, Reg, 2, MRI, SIRI, os);
+      write_define(Slot, Reg, 3, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub2_sub3_sub4_sub5:
+      write_define(Slot, Reg, 2, MRI, SIRI, os);
+      write_define(Slot, Reg, 3, MRI, SIRI, os);
+      write_define(Slot, Reg, 4, MRI, SIRI, os);
+      write_define(Slot, Reg, 5, MRI, SIRI, os);
+      break;
+    case AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7:
+      write_define(Slot, Reg, 0, MRI, SIRI, os);
+      write_define(Slot, Reg, 1, MRI, SIRI, os);
+      write_define(Slot, Reg, 2, MRI, SIRI, os);
+      write_define(Slot, Reg, 3, MRI, SIRI, os);
+      write_define(Slot, Reg, 4, MRI, SIRI, os);
+      write_define(Slot, Reg, 5, MRI, SIRI, os);
+      write_define(Slot, Reg, 6, MRI, SIRI, os);
+      write_define(Slot, Reg, 7, MRI, SIRI, os);
+      break;
+    }
+  }
+}
+
+void write_defines(MachineFunction &MF, const SlotIndexes *SlotIndexes,
+                   const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                   raw_ostream &os) {
+
+  for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) {
+    auto Reg = Register::index2VirtReg(i);
+
+    for (MachineOperand &MO : MRI.def_operands(Reg)) {
+      write_define(MO, SlotIndexes, MRI, SIRI, os);
+    }
+  }
+}
+
+void write_uses(MachineFunction &MF, const SlotIndexes *SlotIndexes,
+                const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                raw_ostream &os) {
+
+  for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) {
+    auto Reg = Register::index2VirtReg(i);
+
+    for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+      // TODO: create write_use if use has more info.
+      write_define(MO, SlotIndexes, MRI, SIRI, os);
+    }
+  }
+}
+
+void write_liveness(SlotIndex Slot, GCNRPTracker::LiveRegSet &LiveSet,
+                    const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                    raw_ostream &os) {
+  os << "{";
+  auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+  json_pair("slot_index", writeSlot, os);
+
+  os << ",";
+
+  auto writeRegs = [&LiveSet, &MRI, &SIRI, &os]() {
+    for (auto it : LiveSet) {
+      unsigned Reg = it.first;
+      LaneBitmask Mask = it.second;
+      write_live(Reg, Mask, MRI, SIRI, os);
+    }
+  };
+  json_array("regs", writeRegs, os);
+  os << "\n},\n";
+}
+
+void write_segment(const LiveInterval::Segment &S, raw_ostream &os) {
+  os << "{";
+  auto writeBegin = [&S, &os]() { S.start.print(os); };
+
+  json_pair("begin", writeBegin, os);
+
+  os << ",";
+
+  auto writeEnd = [&S, &os]() { S.end.print(os); };
+
+  json_pair("end", writeEnd, os);
+
+  os << ",";
+
+  auto writeValNum = [&S, &os]() {
+    if (S.valno)
+      os << S.valno->id;
+    else
+      os << 0xFFFFFFFF;
+  };
+
+  json_pair("val_num", writeValNum, os);
+
+  os << "},\n";
+}
+
+void write_subrange(const LiveInterval::SubRange &SR, raw_ostream &os) {
+  os << "{\n";
+  auto writeMask = [&SR, &os]() { os << SR.LaneMask.getAsInteger(); };
+
+  json_pair("mask", writeMask, os);
+
+  os << ",";
+
+  // Segments.
+  auto writeSegments = [&SR, &os]() {
+    for (auto &S : SR.segments) {
+      write_segment(S, os);
+    }
+  };
+
+  json_array("segments", writeSegments, os);
+
+  os << "\n},\n";
+}
+
+void write_live_interval(LiveInterval &LI, const MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI, raw_ostream &os) {
+  os << "{\n";
+
+  auto writeReg = [&LI, &MRI, &SIRI, &os]() {
+    write_reg(LI.reg(), 0, MRI, SIRI, os);
+  };
+
+  json_obj_pair("reg", writeReg, os);
+
+  os << ",";
+
+  auto writeSegments = [&LI, &os]() {
+    for (auto &S : LI.segments) {
+      write_segment(S, os);
+    }
+  };
+
+  json_array("segments", writeSegments, os);
+
+  os << ",";
+
+  auto writeSubRanges = [&LI, &os]() {
+    for (auto &SR : LI.subranges()) {
+      write_subrange(SR, os);
+    }
+  };
+
+  json_array("subranges", writeSubRanges, os);
+
+  os << "},\n";
+}
+
+std::string get_legal_str(const MDString *MDStr) {
+  std::string str;
+  raw_string_ostream Stream(str);
+  MDStr->print(Stream);
+  Stream.flush();
+  // Remove !.
+  str = str.substr(1);
+  // Remove ""
+  str = str.substr(1);
+  str.pop_back();
+  std::replace(str.begin(), str.end(), '\\', '#');
+  return str;
+}
+
+void write_file(const MDNode *FileNode, raw_ostream &os) {
+  const MDString *FileName = cast<MDString>(FileNode->getOperand(0).get());
+  StringRef fileNameStr = FileName->getString();
+  if (fileNameStr.find("__AMDGPU_GPUMAP_") == 0)
+    return;
+  if (fileNameStr.find("__AMDGPU_DWARF_") == 0)
+    return;
+
+  os << "{";
+
+  std::string str0 = get_legal_str(FileName);
+  auto writeName = [&str0, &os]() { os << str0; };
+  json_pair("filename", writeName, os);
+
+  os << ",\n";
+
+  const MDString *Content = cast<MDString>(FileNode->getOperand(1).get());
+  std::string str = get_legal_str(Content);
+  auto writeContent = [&str, &os]() { os << str; };
+  json_pair("content", writeContent, os);
+  os << "\n},\n";
+}
+
+void write_DIFile(const DIFile *File, raw_ostream &os) {
+  if (File) {
+    std::string name = get_legal_str(File->getRawFilename());
+    std::string dir = "";
+    if (MDString *MDDir = File->getRawDirectory())
+      dir = get_legal_str(MDDir);
+    os << dir << name;
+  } else {
+    os << "ArtificialFile";
+  }
+}
+
+void write_line_mapping(SlotIndex Slot, DebugLoc DL, raw_ostream &os) {
+  os << "{";
+
+  auto writeSlot = [&Slot, &os]() { Slot.print(os); };
+
+  json_pair("slot_index", writeSlot, os);
+
+  os << ",\n";
+
+  MDNode *Scope = DL.getScope();
+  unsigned line = DL.getLine();
+  unsigned col = DL.getCol();
+
+  auto writeLine = [&line, &os]() { os << line; };
+  json_pair("line", writeLine, os);
+
+  os << ",\n";
+
+  auto writeCol = [&col, &os]() { os << col; };
+  json_pair("col", writeCol, os);
+
+  os << ",\n";
+
+  auto writeFile = [&Scope, &os]() {
+    const DIFile *File = cast<DIScope>(Scope)->getFile();
+    write_DIFile(File, os);
+  };
+  json_pair("file", writeFile, os);
+
+  if (DILocation *inlineDL = DL.getInlinedAt()) {
+    os << ",\n";
+    unsigned inlineLine = inlineDL->getLine();
+    auto writeLine = [&inlineLine, &os]() { os << inlineLine; };
+    json_pair("inline_line", writeLine, os);
+
+    os << ",\n";
+
+    unsigned inlineCol = inlineDL->getColumn();
+    auto writeCol = [&inlineCol, &os]() { os << inlineCol; };
+    json_pair("inline_col", writeCol, os);
+
+    os << ",\n";
+
+    const MDNode *InlineScope = DL.getInlinedAtScope();
+    auto writeFile = [&InlineScope, &os]() {
+      const DIFile *File = cast<DIScope>(InlineScope)->getFile();
+      write_DIFile(File, os);
+    };
+    json_pair("inline_file", writeFile, os);
+  }
+
+  os << "\n},\n";
+}
+
+void write_dbg_val(unsigned Reg, const DIVariable *V, const DIExpression *Exp,
+                   const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+                   raw_ostream &os) {
+  os << "{";
+
+  auto writeReg = [&MRI, &SIRI, &Reg, &os]() {
+    const unsigned SubReg = 0;
+    write_reg(Reg, SubReg, MRI, SIRI, os);
+  };
+  json_obj_pair("reg", writeReg, os);
+
+  os << ",\n";
+
+  if (V) {
+    auto writeName = [&V, &os]() { os << V->getName(); };
+    json_pair("debug_val_name", writeName, os);
+    os << ",\n";
+
+    auto writeFile = [&V, &os]() {
+      const DIFile *File = V->getFile();
+      write_DIFile(File, os);
+    };
+    json_pair("debug_val_file", writeFile, os);
+    os << ",\n";
+
+    auto writeLine = [&V, &os]() { os << V->getLine(); };
+    json_pair("debug_val_line", writeLine, os);
+  }
+
+  if (Exp->isValid() && Exp->getNumElements()) {
+    os << ",\n";
+    auto writeV = [&Exp, &os]() {
+      os << '[';
+      bool NeedSep = false;
+      for (auto Op : Exp->expr_ops()) {
+        if (NeedSep)
+          os << ", ";
+        else
+          NeedSep = true;
+        os << dwarf::OperationEncodingString(Op.getOp());
+        for (unsigned I = 0; I < Op.getNumArgs(); ++I)
+          os << ' ' << Op.getArg(I);
+      }
+      os << "] ";
+    };
+    json_pair("debug_exp", writeV, os);
+  }
+  os << "\n},\n";
+}
+
+void write_dbg_info(MachineFunction &MF, LiveIntervals *LIS,
+                    const MachineRegisterInfo &MRI, const SIInstrInfo *SIII,
+                    const SIRegisterInfo *SIRI, const SlotIndexes *SlotIndexes,
+                    const NamedMDNode *SourceMD, raw_ostream &os) {
+  os << ",\n";
+
+  auto writeFiles = [&SourceMD, &os]() {
+    for (const MDNode *FileNode : SourceMD->operands()) {
+      write_file(FileNode, os);
+    }
+  };
+
+  json_array("files", writeFiles, os);
+
+  os << ",\n";
+
+  auto writeLineMapping = [&MF, &SlotIndexes, &os]() {
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        if (MI.isDebugInstr()) {
+          continue;
+        }
+        const DebugLoc DL = MI.getDebugLoc();
+        if (!DL)
+          continue;
+        SlotIndex Slot = SlotIndexes->getInstructionIndex(MI);
+        write_line_mapping(Slot, DL, os);
+      }
+    }
+  };
+
+  json_array("line_mapping", writeLineMapping, os);
+
+  os << ",\n";
+
+  auto writeDebugVals = [&MF, &MRI, &SIRI, &os]() {
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        if (!MI.isDebugValue())
+          continue;
+
+        MachineOperand &Reg = MI.getOperand(0);
+        if (!Reg.isReg())
+          continue;
+
+        if (Reg.getReg() == 0)
+          continue;
+
+        const DIVariable *V = MI.getDebugVariable();
+        const DIExpression *Exp = MI.getDebugExpression();
+        write_dbg_val(Reg.getReg(), V, Exp, MRI, SIRI, os);
+      }
+    }
+  };
+
+  json_array("debug_vals", writeDebugVals, os);
+}
+
+void write_function(MachineFunction &MF, LiveIntervals *LIS,
+                    const MachineRegisterInfo &MRI, const SIInstrInfo *SIII,
+                    const SIRegisterInfo *SIRI, raw_ostream &os) {
+  const SlotIndexes *SlotIndexes = LIS->getSlotIndexes();
+
+  os << "{\n";
+  auto writeName = [&MF, &os]() { os << MF.getName(); };
+  json_pair("name", writeName, os);
+
+  os << ",\n";
+
+  auto writeBlocks = [&MF, &SlotIndexes, &LIS, &MRI, &SIRI, &SIII, &os]() {
+    for (MachineBasicBlock &MBB : MF) {
+      write_block(MBB, LIS, MRI, SIRI, SIII, os);
+      os << ",\n";
+    }
+  };
+
+  json_array("blocks", writeBlocks, os);
+
+  os << ",\n";
+
+  auto writeDefines = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() {
+    write_defines(MF, SlotIndexes, MRI, SIRI, os);
+  };
+
+  json_array("defines", writeDefines, os);
+
+  os << ",\n";
+
+  auto writeUses = [&MF, &SlotIndexes, &MRI, &SIRI, &os]() {
+    write_uses(MF, SlotIndexes, MRI, SIRI, os);
+  };
+
+  json_array("uses", writeUses, os);
+
+  os << ",\n";
+
+  auto writeLiveness = [&MF, &LIS, &MRI, &SIRI, &os]() {
+    for (MachineBasicBlock &MBB : MF)
+      for (MachineInstr &MI : MBB) {
+        if (MI.isDebugInstr())
+          continue;
+        const SlotIndex &SI = LIS->getInstructionIndex(MI).getBaseIndex();
+        GCNRPTracker::LiveRegSet LISLR = llvm::getLiveRegs(SI, *LIS, MRI);
+        write_liveness(SI, LISLR, MRI, SIRI, os);
+      }
+  };
+
+  json_array("liveness", writeLiveness, os);
+
+  os << ",\n";
+
+  auto writeLiveIntervals = [&MRI, &SIRI, &LIS, &os]() {
+    for (unsigned i = 0; i < MRI.getNumVirtRegs(); i++) {
+      auto Reg = Register::index2VirtReg(i);
+      if (!LIS->hasInterval(Reg))
+        continue;
+      auto &LI = LIS->getInterval(Reg);
+      write_live_interval(LI, MRI, SIRI, os);
+    }
+  };
+
+  json_array("live_intervals", writeLiveIntervals, os);
+
+#if 0 // TODO: Do we need this?
+  // Check debug info.
+  const Function &F = MF.getFunction();
+  const Module *M = F.getParent();
+  const NamedMDNode *SourceMD =
+      M->getNamedMetadata(hlsl::DxilMDHelper::kDxilSourceContentsMDName);
+  if (SourceMD) {
+    write_dbg_info(MF, LIS, MRI, SIII, SIRI, SlotIndexes, SourceMD, os);
+  }
+#endif
+
+  os << "\n}";
+}
+
+void write_pressure(MachineFunction &MF, LiveIntervals *LIS,
+                    const char *Filename) {
+  int FD = -1;
+  SmallString<128> TmpFilename(Filename);
+  std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename);
+  if (EC) {
+    errs() << "Error: " << EC.message() << "\n";
+    return;
+  }
+
+  raw_fd_ostream O(FD, /*shouldClose=*/true);
+
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const auto *SIII = ST->getInstrInfo();
+  const auto *SIRI = ST->getRegisterInfo();
+  auto &MRI = MF.getRegInfo();
+  write_function(MF, LIS, MRI, SIII, SIRI, O);
+  O.flush();
+  O.close();
+}
+
+void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) {
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const auto *SIII = ST->getInstrInfo();
+  const auto *SIRI = ST->getRegisterInfo();
+  auto &MRI = MF.getRegInfo();
+  write_function(MF, LIS, MRI, SIII, SIRI, os);
+  os.flush();
+}
+
+} // namespace pressure
+}// namespace llvm
+
+namespace {
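+// ContributionList records, for every MachineInstr, the transitive set of
+// instructions that contribute to it through its register operands and the
+// set of instructions it contributes to, and can dump the result as JSON.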
+class ContributionList {
+public:
+  ContributionList(MachineFunction &MF) : MF(MF){};
+  void build();
+  bool propagateContribution();
+  MachineFunction &MF;
+  DenseMap<MachineInstr *, unsigned> MIIndexMap;
+  // Set of inst which contribute to build the key MachineInstr.
+  DenseMap<MachineInstr *, DenseSet<MachineInstr *>> MIContributorMap;
+  // Set of inst which been contributed by the key MachineInstr.
+  DenseMap<MachineInstr *, DenseSet<MachineInstr *>> MIContributedToMap;
+  void writeInst(MachineInstr &MI, const SIInstrInfo *SIII, raw_ostream &os);
+  void writeBlock(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
+                  raw_ostream &os);
+  void write(raw_ostream &os);
+};
+
+void buildMIContribution(MachineInstr &MI,
+                         DenseSet<MachineInstr *> &ContributorSet,
+                         DenseSet<MachineInstr *> &ContributedSet,
+                         const SIRegisterInfo &SIRI, MachineRegisterInfo &MRI) {
+  for (MachineOperand &UseMO : MI.uses()) {
+    if (!UseMO.isReg())
+      continue;
+    Register Reg = UseMO.getReg();
+    if (Reg.isPhysical())
+      continue;
+    if (UseMO.isImplicit()) {
+      // if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
+      //    Reg == AMDGPU::SCC)
+      continue;
+    }
+    for (MachineInstr &DefMI : MRI.def_instructions(Reg)) {
+      ContributorSet.insert(&DefMI);
+    }
+  }
+
+  for (MachineOperand &DstMO : MI.defs()) {
+    if (!DstMO.isReg())
+      continue;
+    if (DstMO.isImplicit())
+      continue;
+    Register Reg = DstMO.getReg();
+    if (Reg.isPhysical())
+      continue;
+    for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
+      ContributedSet.insert(&UseMI);
+    }
+  }
+}
+
+bool ContributionList::propagateContribution() {
+  bool bUpdated = false;
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (auto *MBB : RPOT) {
+    for (auto &MI : *MBB) {
+      auto &contributors = MIContributorMap[&MI];
+      unsigned size = contributors.size();
+      DenseSet<MachineInstr *> parentContributors;
+      for (auto *CMI : contributors) {
+        auto &pContributors = MIContributorMap[CMI];
+        parentContributors.insert(pContributors.begin(), pContributors.end());
+      }
+      contributors.insert(parentContributors.begin(), parentContributors.end());
+      bUpdated |= size < contributors.size();
+    }
+  }
+  return bUpdated;
+}
+
+void ContributionList::build() {
+  // Build contribution.
+  auto &MRI = MF.getRegInfo();
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const auto *SIRI = ST->getRegisterInfo();
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      auto &contributors = MIContributorMap[&MI];
+      auto &contributed = MIContributedToMap[&MI];
+      buildMIContribution(MI, contributors, contributed, *SIRI, MRI);
+    }
+  }
+  // propagate contribution.
+  bool bUpdated = true;
+  while (bUpdated) {
+    bUpdated = propagateContribution();
+  }
+}
+
+void ContributionList::writeInst(MachineInstr &MI, const SIInstrInfo *SIII,
+                                 raw_ostream &os) {
+  os << "\n{\n";
+  unsigned ID = MIIndexMap[&MI];
+  auto writeSlot = [&ID, &os]() { os << ID; };
+
+  json_pair("ID", writeSlot, os);
+
+  os << ",";
+
+  auto writeAsm = [&MI, &SIII, &os]() {
+    MI.print(os, /*IsStandalone*/ true, /*SkipOpers*/ false,
+             /*SkipDebugLoc*/ true, /*AddNewLine*/ false, SIII);
+  };
+  json_pair("asm", writeAsm, os);
+
+  os << ",\n";
+
+  auto &contributors = MIContributorMap[&MI];
+  auto writeContributor = [&contributors, this, &os]() {
+    for (auto *MI : contributors) {
+      unsigned ID = MIIndexMap[MI];
+      os << ID << ",";
+    }
+  };
+
+  json_array("contributors", writeContributor, os);
+  os << ",\n";
+
+  auto &contributeds = MIContributedToMap[&MI];
+  auto writeContributed = [&contributeds, this, &os]() {
+    for (auto *MI : contributeds) {
+      unsigned ID = MIIndexMap[MI];
+      os << ID << ",";
+    }
+  };
+
+  json_array("contributed", writeContributed, os);
+  os << "\n}\n";
+}
+
+void ContributionList::writeBlock(MachineBasicBlock &MBB,
+                                  const SIInstrInfo *SIII, raw_ostream &os) {
+  os << "{\n";
+  auto writeName = [&MBB, &os]() { os << MBB.getName(); };
+  json_pair("name", writeName, os);
+
+  os << ",";
+
+  auto writeIndex = [&MBB, &os]() { os << MBB.getNumber(); };
+  json_pair("id", writeIndex, os);
+
+  os << ",\n";
+
+  auto writeInsts = [this, &MBB, &SIII, &os]() {
+    for (MachineInstr &MI : MBB) {
+      if (MI.isDebugInstr())
+        continue;
+      writeInst(MI, SIII, os);
+      os << ",\n";
+    }
+  };
+
+  json_array("instructions", writeInsts, os);
+
+  os << ",\n";
+
+  auto writePreds = [&MBB, &os]() {
+    for (MachineBasicBlock *Pred : MBB.predecessors()) {
+      os << Pred->getNumber() << ",";
+    }
+  };
+
+  json_array("preds", writePreds, os);
+
+  os << ",";
+
+  auto writeSuccs = [&MBB, &os]() {
+    for (MachineBasicBlock *Succ : MBB.successors()) {
+      os << Succ->getNumber() << ",";
+    }
+  };
+
+  json_array("succs", writeSuccs, os);
+
+  os << "}";
+}
+
+void ContributionList::write(raw_ostream &os) {
+  unsigned ID = 0;
+  // Build ID for write.
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (auto *MBB : RPOT) {
+    for (auto &MI : *MBB) {
+      MIIndexMap[&MI] = ID++;
+    }
+  }
+
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const auto *SIII = ST->getInstrInfo();
+
+  os << "{\n";
+  auto writeName = [this, &os]() { os << MF.getName(); };
+  json_pair("name", writeName, os);
+
+  os << ",\n";
+
+  auto writeBlocks = [this, &SIII, &RPOT, &os]() {
+    for (auto *MBB : RPOT) {
+      writeBlock(*MBB, SIII, os);
+      os << ",\n";
+    }
+  };
+
+  json_array("blocks", writeBlocks, os);
+
+  os << "\n}";
+}
+} // namespace
+
+namespace llvm {
+
+void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) {
+  int FD = -1;
+  SmallString<128> TmpFilename(Filename);
+  std::error_code EC = sys::fs::createUniqueFile(TmpFilename, FD, TmpFilename);
+  if (EC) {
+    errs() << "Error: " << EC.message() << "\n";
+    return;
+  }
+
+  raw_fd_ostream O(FD, /*shouldClose=*/true);
+  ContributionList CL(MF);
+  CL.build();
+
+  CL.write(O);
+
+  O.flush();
+  O.close();
+}
+} // namespace llvm
+
+static bool IsPhysReg(const MachineOperand &Op)
+{
+    return Op.isReg() && Op.getReg().isPhysical();
+}
+
+// Sometimes a split block uses physical registers defined in the original
+// block; they have to be added to the live-in list or the MIR is malformed.
+void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI)
+{
+    // Initialize with current set of liveins. For new blocks this will be empty.
+    SmallDenseSet<unsigned, 8> DefSet;
+    for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins())
+    {
+        DefSet.insert(P.PhysReg);
+    }
+
+    for (auto &MI : *NewBB)
+    {
+        // Add all undefined physical registers to the live in set.
+        for (MachineOperand &Use : MI.operands())
+        {
+            // Only process physreg uses.
+            if (!IsPhysReg(Use) || !Use.isUse()) continue;
+
+            // Reserved regs do not need to be tracked through live-in sets.
+            unsigned Reg = Use.getReg();
+            if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue;
+
+            if (!DefSet.count(Reg))
+                NewBB->addLiveIn(Reg);
+        }
+
+        // Add all physical register defs (explicit+implicit) to the def
+        // register set.
+        for (MachineOperand &Def : MI.operands())
+        {
+            // Only process physreg defs.
+            if (!IsPhysReg(Def) || !Def.isDef()) continue;
+            DefSet.insert(Def.getReg());
+        }
+    }
+}
+
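+// Rebuild the physical-register live-in list of NewBB by walking the block
+// bottom-up from LiveOutSet: defs remove a register from the tracked set,
+// uses of non-reserved registers add it, and whatever remains at the top of
+// the block becomes a live-in.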
+void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB,
+                                      SmallDenseSet<unsigned, 8> &LiveOutSet,
+                                      const MachineRegisterInfo *MRI) {
+  for (auto rit = NewBB->rbegin(); rit != NewBB->rend(); rit++) {
+    auto &MI = *rit;
+    // Remove all physical register defs (explicit+implicit) from the live
+    // set.
+    for (MachineOperand &Def : MI.operands()) {
+      // Only process physreg defs.
+      if (!IsPhysReg(Def) || !Def.isDef())
+        continue;
+      LiveOutSet.erase(Def.getReg());
+    }
+    // Add all undefined physical registers to the live in set.
+    for (MachineOperand &Use : MI.operands()) {
+      // Only process physreg uses.
+      if (!IsPhysReg(Use) || !Use.isUse())
+        continue;
+
+      // Reserved regs do not need to be tracked through live-in sets.
+      unsigned Reg = Use.getReg();
+      if (Use.isImplicit() && MRI && MRI->isReserved(Reg))
+        continue;
+
+      if (!LiveOutSet.count(Reg))
+        LiveOutSet.insert(Reg);
+    }
+  }
+  for (unsigned Reg : LiveOutSet) {
+    NewBB->addLiveIn(Reg);
+  }
+}
+
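+// Create a virtual register matching the register class that instruction
+// Opcode expects for operand OpNum. Reports a fatal error when the operand
+// has no register class.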
+MachineReg llvm::CreateVirtualRegForOperand(
+    MachineOpcode Opcode,
+    unsigned OpNum,
+    MachineFunction &MF
+)
+{
+    const TargetSubtargetInfo &ST = MF.getSubtarget();
+    const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+    const TargetInstrInfo *TII = ST.getInstrInfo();
+    const MCInstrDesc &Desc = TII->get(Opcode);
+    const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF);
+    if (!RC)
+    {
+        llvm::report_fatal_error("Unable to create virtual reg for instruction operand");
+    }
+
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    return MRI.createVirtualRegister(RC);
+}
+
+MachineReg llvm::CreateVirtualDstReg(
+    MachineOpcode Opcode,
+    MachineFunction &MF
+)
+{
+    return llvm::CreateVirtualRegForOperand(Opcode, 0, MF);
+}
+
+// Return true if the MI is a copy of exec.
+// If true then sets pDst to the destination register.
+bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst)
+{
+    enum {DST=0, SRC=1};
+    bool FoundCopy = false;
+    if (MI.getOpcode() == AMDGPU::COPY
+        || MI.getOpcode() == AMDGPU::S_MOV_B32
+        || MI.getOpcode() == AMDGPU::S_MOV_B64)
+    {
+        const MachineOperand &Src = MI.getOperand(SRC);
+        if (Src.isReg() && Src.getReg() == Exec)
+        {
+            FoundCopy = true;
+        }
+    }
+#if 0 // TODO: Delete this.
+    else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO ||
+             MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32)
+    {
+        FoundCopy = true;
+    }
+#endif
+
+    if (FoundCopy)
+    {
+        *pDst = MI.getOperand(DST).getReg();
+    }
+
+    return FoundCopy;
+}
+
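+// Return the register (and sub-register) holding the WQM entry active mask,
+// or AMDGPU::NoRegister when no defining instruction is found (currently
+// always, since the lookup below is disabled).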
+llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF)
+{
+    llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister};
+    if (MachineInstr* MI = GetWqmEntryActiveMaskInst(MF))
+    {
+        LiveLaneMask.Reg = MI->getOperand(0).getReg();
+        LiveLaneMask.SubReg = MI->getOperand(0).getSubReg();
+    }
+
+    return LiveLaneMask;
+}
+
+MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF)
+{
+#if 0 // TODO: Get rid of this
+    // Look forward in the entry block for the SET_LIVE_LANE_MASK instruction.
+    // This instruction is added by the SIWholeQuadMode pass.
+    MachineBasicBlock &MBB = MF.front();
+    for (MachineInstr &MI : MBB)
+    {
+        if (MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK ||
+            MI.getOpcode() == AMDGPU::AMDGPU_SET_LIVE_LANE_MASK_32)
+        {
+            return &MI;
+        }
+    }
+#endif
+
+    return nullptr;
+}
+
+bool llvm::IsFetchShaderCall(const MachineInstr *MI)
+{
+#if 0 // TODO: Get rid of this.
+    return 
+        MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER ||
+        MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall);
+#else
+    return false;
+#endif
+}
+
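+// Return true if SCC is live at MI: scan forward in MBB for a read of SCC
+// before the next def; if the end of the block is reached, check whether any
+// successor lists SCC as a live-in.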
+bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) {
+  const TargetRegisterInfo* TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo();
+  for (auto it = MI; it != MBB->end(); ++it) {
+    const MachineInstr &CurMI = *it;
+    // Hit use of scc, it is live.
+    if (CurMI.readsRegister(AMDGPU::SCC, TRI))
+      return true;
+    // Hit def of scc first, not live.
+    if (CurMI.definesRegister(AMDGPU::SCC, TRI))
+      return false;
+  }
+  // Reach the end of MBB, check live-ins of MBB successors.
+  for (const MachineBasicBlock *Succ : MBB->successors()) {
+    if (Succ->isLiveIn(AMDGPU::SCC))
+      return true;
+  }
+  return false;
+}
+
+//
+// This function is useful for when we need to insert a new
+// instruction that defines scc in a block and we need to find
+// a location that will not smash the existing value.
+//
+// Starting at `BeforeInst` it will look backwards to try to find
+// a place in the block where scc is dead so we can insert our new
+// def there. If no location can be found it will save and restore
+// scc around BeforeInst. This way BeforeInst can safely be used
+// as the new insert location.
+//
+MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef(
+    MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MI,
+    const TargetRegisterInfo* TRI,
+    const SIInstrInfo* TII,
+    MachineRegisterInfo* MRI,
+    SccDefInsertPointConstraintFlags Constraints
+)
+{
+    // If SCC is dead at MI then we can use MI as the insert point.
+    if (!llvm::IsSccLiveAt(MBB, MI))
+    {
+        return MI;
+    }
+
+    const bool CheckForExecWrite =
+        Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
+
+    // Get the starting reverse iterator taking care to handle the MBB->end() case.
+    MachineBasicBlock::reverse_iterator Start;
+    if (MI == MBB->end())
+    {
+        Start = MBB->rbegin();
+    }
+    else
+    {
+        Start = MI.getReverse();
+    }
+
+    // Otherwise, walk backwards through the block looking for a location where
+    // SCC is dead.
+    for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); It != End; ++It)
+    {
+        // If the instruction modifies exec then we cannot use it as
+        // an insertion point (if that is a constraint from the caller).
+        // The check for EXEC works for both wave64 and wave32 because
+        // it will also catch writes to the subregisters (e.g. exec_lo).
+        if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
+        {
+            break;
+        }
+
+        if (It->modifiesRegister(AMDGPU::SCC, TRI) 
+            && !It->readsRegister(AMDGPU::SCC, TRI))
+        {
+            return It->getIterator();
+        }
+    }
+
+    // If no safe location can be found in the block we can save and restore
+    // SCC around MI. There is no way to directly read or write SCC so we use
+    // s_cselect to read the current value of SCC and s_cmp to write the saved
+    // value back to SCC.
+    //
+    // The generated code will look like this;
+    //
+    //      S_CSELECT_B32 %SavedSCC, -1, 0  # Save SCC
+    //      <----- Newly created safe insert point.
+    //      MI
+    //      S_CMP_LG_U32 %SavedSCC, 0       # Restore SCC
+    //
+    unsigned int TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    DebugLoc DL = MI->getDebugLoc();
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
+        .addImm(-1)
+        .addImm(0);
+    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII->get(AMDGPU::S_CMP_LG_U32))
+        .addReg(TmpScc, RegState::Kill)
+        .addImm(0);
+
+    return MI;
+}
+
+
+namespace {
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
+                    SmallDenseSet<MachineBasicBlock *, 2> &touchedMBBSet) {
+  MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start);
+  MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end);
+  // Treat segments not bounded by instructions as not local.
+  if (!startMI || !endMI)
+    return false;
+  // The segment is local when both ends are in the same MBB.
+  bool IsSameMBB = startMI->getParent() == endMI->getParent();
+  if (!IsSameMBB)
+    return false;
+  // Collect touched MBB.
+  MachineBasicBlock *MBB = startMI->getParent();
+  touchedMBBSet.insert(MBB);
+  return true;
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes,
+                      SmallDenseSet<MachineBasicBlock *, 2> &touchedMBBSet) {
+  for (const LiveRange::Segment &Seg : Range->segments) {
+    if (!isLocalSegment(&Seg, Indexes, touchedMBBSet))
+      return false;
+  }
+  return true;
+}
+
+bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes) {
+  MachineInstr *startMI = Indexes->getInstructionFromIndex(Seg->start);
+  MachineInstr *endMI = Indexes->getInstructionFromIndex(Seg->end);
+  // Treat segments not bounded by instructions as not local.
+  if (!startMI || !endMI)
+    return false;
+  // The segment is local when both ends are in the same MBB.
+  return startMI->getParent() == endMI->getParent();
+}
+
+bool isLocalLiveRange(const LiveRange *Range, SlotIndexes *Indexes) {
+  for (const LiveRange::Segment &Seg : Range->segments) {
+    if (!isLocalSegment(&Seg, Indexes))
+      return false;
+  }
+  return true;
+}
+
+} // namespace
+
+// In a case like a float4 value v where v.x is used and defined in one block
+// and v.y is used and defined in another block, a single live interval can
+// touch more than one MBB. touchedMBBSet is used for scheduling, where a
+// local live interval can cross multiple regions and the live registers must
+// be computed for each region inside the touched MBBs.
+bool llvm::isLocalLiveInterval(
+    const LiveInterval &LI, SlotIndexes *Indexes,
+    SmallDenseSet<MachineBasicBlock *, 2> &touchedMBBSet) {
+  if (LI.hasSubRanges()) {
+    for (const auto &S : LI.subranges()) {
+      if (!isLocalLiveRange(&S, Indexes, touchedMBBSet))
+        return false;
+    }
+  }
+  return isLocalLiveRange(&LI, Indexes, touchedMBBSet);
+}
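+
+// Illustrative usage sketch (not part of the patch): a caller that wants to
+// limit live-register recomputation to the blocks a subreg-split value
+// actually touches. recomputeRegionLiveRegs is a hypothetical helper.
+//
+//   SmallDenseSet<MachineBasicBlock *, 2> TouchedMBBs;
+//   if (llvm::isLocalLiveInterval(LI, Indexes, TouchedMBBs)) {
+//     // Every segment is block-local; only the touched MBBs need their
+//     // region live sets recomputed.
+//     for (MachineBasicBlock *MBB : TouchedMBBs)
+//       recomputeRegionLiveRegs(MBB);
+//   }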
+
+
+bool llvm::isLocalLiveInterval(
+    const LiveInterval &LI, SlotIndexes *Indexes) {
+  if (LI.hasSubRanges()) {
+    for (const auto &S : LI.subranges()) {
+      if (!isLocalLiveRange(&S, Indexes))
+        return false;
+    }
+  }
+  return isLocalLiveRange(&LI, Indexes);
+}
+
+// This is used to speed up register pressure calculation.
+// If an instruction is moved, the cached live set becomes out of date;
+// it is only guaranteed to be correct before instructions are moved.
+void llvm::buildEndLiveMap(
+    llvm::LiveIntervals *LIS, llvm::MachineFunction &MF,
+    const llvm::MachineRegisterInfo &MRI,
+    llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet>
+        &MBBLiveMap, bool After) {
+  // When there is only one block, the end live set must be empty.
+  if (MF.size() == 1)
+    return;
+  auto *SlotIndexes = LIS->getSlotIndexes();
+  DenseMap<MachineBasicBlock *, SlotIndex> MBBOutputSlotMap;
+  for (MachineBasicBlock &MBB : MF) {
+    auto BBEnd = MBB.rbegin();
+
+    // BBEnd doesn't necessarily point to the boundary instruction;
+    // skip debug instructions.
+    if (llvm::GetNonDebugMBBEnd(BBEnd, MBB)) {
+      auto SI = SlotIndexes->getInstructionIndex(*BBEnd);
+      MBBOutputSlotMap[&MBB] = After ? SI.getDeadSlot() : SI.getBaseIndex();
+    }
+  }
+
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    auto Reg = Register::index2VirtReg(I);
+    if (!LIS->hasInterval(Reg))
+      continue;
+
+    const auto &LI = LIS->getInterval(Reg);
+
+    // Skip local live intervals to make live input/output calculation faster.
+    if (llvm::isLocalLiveInterval(LI, SlotIndexes))
+      continue;
+
+    for (auto outputIt : MBBOutputSlotMap) {
+      MachineBasicBlock *MBB = outputIt.first;
+      auto SI = outputIt.second;
+
+      auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI);
+      if (LiveMask.any())
+        MBBLiveMap[MBB][Reg] = LiveMask;
+    }
+  }
+}
+
+unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) {
+  auto &MRI = MF.getRegInfo();
+  for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+    if (MRI.isPhysRegUsed(Reg)) {
+      return SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::VGPR0) + 1;
+    }
+  }
+  return 0;
+}
+
+unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned MaxSGPR = 0;
+  for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+    if (MRI.isPhysRegUsed(Reg)) {
+      // Skip the reserved scratch RSRC register, a wide register that does
+      // not really contribute to this statistic.
+      if (ScratchRSrcReg != 0) {
+        if (SIRI->isSubRegister(ScratchRSrcReg, Reg))
+          continue;
+      }
+      MaxSGPR = SIRI->getHWRegIndex(Reg) - SIRI->getHWRegIndex(AMDGPU::SGPR0);
+      break;
+    }
+  }
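+  // MaxSGPR is a zero-based HW register index; +1 converts it to a count and
+  // RegForVCC reserves the two SGPRs used for VCC.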
+  return 1 + llvm::RegForVCC + MaxSGPR;
+}
+
+void llvm::dumpLiveSet(const LiveSet &LiveSet,
+                 const SIRegisterInfo *SIRI) {
+
+  dbgs() << "\n live set: \n";
+  for (auto it : LiveSet) {
+    int Reg = it.first;
+    dbgs() << printReg(Reg, SIRI);
+    if (it.second.any()) {
+      dbgs() << " mask:" << it.second.getAsInteger();
+    }
+    dbgs() << "\n";
+  }
+}
+
+// Test if all fast math flags of this Machine Instr are set. This allows
+// all non-strict floating-point transforms.
+bool llvm::isFastMathInst(llvm::MachineInstr &MI) {
+  // Follow the checks in isFast() in SelectionDAGNodes.h
+  return MI.getFlag(llvm::MachineInstr::MIFlag::FmNsz) &&
+         MI.getFlag(llvm::MachineInstr::MIFlag::FmArcp) &&
+         MI.getFlag(llvm::MachineInstr::MIFlag::FmNoNans) &&
+         MI.getFlag(llvm::MachineInstr::MIFlag::FmNoInfs) &&
+         MI.getFlag(llvm::MachineInstr::MIFlag::FmContract) &&
+         MI.getFlag(llvm::MachineInstr::MIFlag::FmAfn) &&
+         MI.getFlag(llvm::MachineInstr::MIFlag::FmReassoc);
+}
+#if 0
+bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage)
+{
+    switch (Stage)
+    {
+    case xmd::HwStage::PS:
+    case xmd::HwStage::CS:
+        return true;
+    default:
+        return false;
+    }
+}
+#endif
+
+MachineBasicBlock::succ_iterator
+llvm::FindSuccessor(llvm::MachineBasicBlock *MBB,
+                    llvm::MachineBasicBlock *Succ) {
+  for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(),
+                                        End = MBB->succ_end();
+       It != End; ++It) {
+    if (*It == Succ)
+      return It;
+  }
+  return MBB->succ_end();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
new file mode 100644
index 000000000000000..16b55c5c945835f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -0,0 +1,217 @@
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+class MachineFunction;
+class LiveIntervals;
+class LiveInterval;
+class MachineRegisterInfo;
+class SIRegisterInfo;
+class SIInstrInfo;
+class MachineInstr;
+class MachinePostDominatorTree;
+class MachineLoopInfo;
+class MachineDominatorTree;
+class SlotIndexes;
+class raw_ostream;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+using MachineReg = unsigned;
+using MachineOpcode = unsigned;
+
+constexpr unsigned RegForVCC = 2;
+constexpr unsigned VGPR_LIMIT = 256;
+// Post-RA remat only tries to help the case where pressure is OK before RA
+// but the RA result is higher. The difference should not be too large, so
+// just use 4 as the threshold here.
+constexpr unsigned PostRARematThreshHold = 4;
+
+using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
+
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI);
+void CollectLiveSetPressure(
+    const LiveSet &liveSet,
+    const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI,
+    unsigned &VPressure, unsigned &SPressure);
+
+bool isExecUpdateForControlFlow(llvm::MachineInstr &MI);
+
+bool IsSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI);
+
+llvm::LaneBitmask getRegMask(const llvm::MachineOperand &MO,
+                             const llvm::MachineRegisterInfo &MRI);
+void andLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet);
+void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet);
+void mergeLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet);
+llvm::MachineBasicBlock *split(llvm::MachineInstr *I);
+
+// For an inst like S_BUFFER_LOAD_DWORDX16, change it to S_BUFFER_LOAD_DWORDX4
+// if only 4 lanes are used.
+bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
+                       const llvm::SIRegisterInfo *TRI,
+                       const llvm::SIInstrInfo *TII,
+                       llvm::SlotIndexes *SlotIndexes);
+
+bool reach_block(llvm::MachineBasicBlock *FromBB, llvm::MachineDominatorTree *DT,
+                 llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
+                 llvm::MachineBasicBlock *ToBB);
+
+
+void viewCFGWithPhi(llvm::MachineFunction &MF);
+void write_contribution_list(llvm::MachineFunction &MF, const char *Filename);
+
+llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII);
+
+bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
+                       llvm::MachineBasicBlock &MBB);
+
+void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI);
+
+void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB,
+                                 llvm::SmallDenseSet<unsigned, 8> &LiveOutSet,
+                                 const llvm::MachineRegisterInfo *MRI);
+
+MachineReg CreateVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand,
+                                      llvm::MachineFunction &MF);
+
+MachineReg CreateVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF);
+
+bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst);
+struct MachineRegWithSubReg {
+  MachineReg Reg = AMDGPU::NoRegister;
+  unsigned SubReg = AMDGPU::NoSubRegister;
+};
+MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF);
+llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF);
+
+// Return true if this machine instruction represents a call to the fetch
+// shader. We currently have two mechanisms for calling the fetch shader:
+// 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction.
+// 2. A CALL instruction with the `FetchShaderCall` flag set to true.
+bool IsFetchShaderCall(const llvm::MachineInstr *MI);
+
+bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI);
+
+
+// An enum used to pass additional constraints to
+// `FindOrCreateInsertionPointForSccDef()`. This further constrains the
+// location where the SCC def can be inserted.
+enum SccDefInsertPointConstraintFlags {
+  None = 0,        // No additional constraints.
+  NoExecWrite = 1, // No modification of exec between BeforeInst and the
+                   // insert point.
+};
+
+// Look for a safe place to insert an instruction that defines SCC.
+//
+// This is useful when we need to insert a new instruction that defines SCC
+// in a block and we need a location that will not clobber the existing value.
+//
+// Starting at `BeforeInst` it looks backwards for a place in the block where
+// SCC is dead, so the new def can be inserted there. If no such location can
+// be found it saves and restores SCC around BeforeInst, so that BeforeInst
+// can safely be used as the new insert location.
+//
+llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef(
+    llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst,
+    const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII,
+    llvm::MachineRegisterInfo *MRI,
+    SccDefInsertPointConstraintFlags Constraints =
+        SccDefInsertPointConstraintFlags::None);
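+
+// Illustrative usage (not part of the patch; the registers and opcode in the
+// BuildMI call are placeholders):
+//
+//   MachineBasicBlock::iterator InsertPt =
+//       FindOrCreateInsertionPointForSccDef(
+//           MBB, BeforeInst, TRI, TII, MRI,
+//           SccDefInsertPointConstraintFlags::NoExecWrite);
+//   // SCC is dead (or saved/restored) at InsertPt, so an SCC-defining
+//   // instruction can be emitted there without clobbering a live value.
+//   BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+//       .addReg(Lhs)
+//       .addReg(Rhs);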
+
+// Check if LI is live across basic blocks; if it is local, record every
+// touched basic block in touchedMBBSet.
+bool isLocalLiveInterval(
+    const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
+    llvm::SmallDenseSet<llvm::MachineBasicBlock *, 2> &touchedMBBSet);
+bool isLocalLiveInterval(
+    const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes);
+
+// Build the live register set at the end of each MBB.
+void buildEndLiveMap(
+    llvm::LiveIntervals *LIS, llvm::MachineFunction &MF,
+    const llvm::MachineRegisterInfo &MRI,
+    llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet>
+        &MBBLiveMap, bool After);
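+
+// Illustrative usage (not part of the patch):
+//
+//   llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet> MBBLiveMap;
+//   // After selects whether liveness is sampled at the dead slot after, or
+//   // the base index before, the last non-debug instruction of each block.
+//   buildEndLiveMap(LIS, MF, MF.getRegInfo(), MBBLiveMap, /*After=*/true);
+//   for (auto &Entry : MBBLiveMap)
+//     dumpLiveSet(Entry.second, SIRI);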
+
+void dumpLiveSet(const LiveSet &LiveSet,
+                 const llvm::SIRegisterInfo *SIRI);
+
+unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI);
+unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI);
+
+bool isFastMathInst(llvm::MachineInstr &MI);
+
+namespace pressure {
+void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI,
+               const llvm::SIRegisterInfo *SIRI,
+               llvm::raw_ostream &os);
+void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS,
+                    const char *Filename);
+void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS,
+                    llvm::raw_ostream &os);
+} // namespace pressure
+// bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage);
+
+// Look for the successor `Succ` of the given `MBB`.
+// Returns MBB->succ_end() if `Succ` is not a successor of MBB.
+llvm::MachineBasicBlock::succ_iterator FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ);
+
+// The enum and helper function for v_perm selection mask.
+//
+// The input byte layout of v_perm is as below:
+//
+// BYTE in[8]
+// in[0] = $src1_BYTE0;
+// in[1] = $src1_BYTE1;
+// in[2] = $src1_BYTE2;
+// in[3] = $src1_BYTE3;
+// in[4] = $src0_BYTE0;
+// in[5] = $src0_BYTE1;
+// in[6] = $src0_BYTE2;
+// in[7] = $src0_BYTE3;
+//
+enum class V_PERM_IN_BYTE_POS {
+  src1_BYTE0 = 0,
+  src1_BYTE1,
+  src1_BYTE2,
+  src1_BYTE3,
+  src0_BYTE0,
+  src0_BYTE1,
+  src0_BYTE2,
+  src0_BYTE3
+};
+
+// The 4 arguments specify which input byte will be output
+// out[0] = Sel_0;
+// out[1] = Sel_1;
+// out[2] = Sel_2;
+// out[3] = Sel_3;
+//
+constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0,
+                                   V_PERM_IN_BYTE_POS Sel_1,
+                                   V_PERM_IN_BYTE_POS Sel_2,
+                                   V_PERM_IN_BYTE_POS Sel_3) {
+  return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) |
+          ((int)Sel_1 << 8) | (int)Sel_0);
+}
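+
+// Illustrative example (not part of the patch), assuming out[i] is filled
+// from in[Sel_i] as described above: select the two low bytes of src0
+// followed by the two low bytes of src1.
+//
+//   constexpr int PackLoSel =
+//       buildVPermSelectMask(V_PERM_IN_BYTE_POS::src0_BYTE0,
+//                            V_PERM_IN_BYTE_POS::src0_BYTE1,
+//                            V_PERM_IN_BYTE_POS::src1_BYTE0,
+//                            V_PERM_IN_BYTE_POS::src1_BYTE1);
+//   static_assert(PackLoSel == 0x01000504, "low-half pack selector");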
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
new file mode 100644
index 000000000000000..ceb22b5ff9243dc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
@@ -0,0 +1,2767 @@
+//===- AMDGPUMirDivergenceAnalysis.cpp - MIR Divergence Analysis ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is based on Analysis/DivergenceAnalysis.cpp. The most important
+// difference is the introduction of the idea of "bit-divergence".
+//
+// Booleans in AMDGPU are represented as a 64-bit unsigned integer in a pair
+// of scalar registers, where each bit represents a boolean value for one
+// lane. If all active lanes have the same bool value (all 1's or all 0's),
+// then we can generate a scalar branch; otherwise we must use the exec mask
+// to selectively execute lanes based on the boolean mask. When all values in
+// a boolean mask are the same for all active lanes, we call that mask
+// "bit-uniform", otherwise we call it "bit-divergent". This differs from the
+// normal concept of "uniform" and "divergent", which describes whether a
+// value may differ across the 64 lanes. A "bit-divergent" value is still
+// "uniform" in the sense that it is the same 64-bit value from the
+// perspective of all lanes, but when used as a branch condition it will make
+// the branch divergent, which in turn makes the uses of any values defined
+// outside of the control flow region divergent.
+//
+// The original DA marks everything including bools as divergent or uniform
+// based on the propagation of divergent sources. However, booleans in AMDGPU
+// are in fact never "divergent". Comparison operations that receive divergent
+// operands instead produce "bit-divergent" or "bit-uniform" 64-bit booleans.
+// Between the definition of any boolean mask and its use (particularly in
+// branches, cndmasks, or anything that specifically consumes booleans), there
+// can be any arbitrary number and types of operations performed on it,
+// including combining it with other boolean masks via bit operations.
+//
+// The XDA algorithm is a modified version of the original DA algorithm to
+// simultaneously propagate regular divergence and bit-divergence.
+//
+// First off, XDA identifies all sources of divergence as well as
+// bit-divergence and adds them to the worklist. Then, just like with LLVM DA,
+// it pops values off of the worklist to propagate (bit-)divergence to all its
+// users, unless the user is always (bit-)uniform when given (bit-)divergent
+// operand. It's possible for a value to be marked as both divergent and
+// bit-divergent, in which case the regular divergence will trump
+// bit-divergence.
+//
+// The important difference in this propagation step is that there are special
+// instructions that when given bit-divergent operands, produce divergent
+// values and vice versa.
+//
+// An example is comparison:
+//
+// v0 = interp ...               ; divergent
+// v1 = interp ...               ; divergent
+// s[0:1] = v_cmp v0, v1         ; bit-divergent
+//
+// v0 and v1 are both divergent, but when propagating them, the v_cmp (and its
+// result) is bit-divergent value instead of divergent.
+//
+//
+// An example of the reverse:
+//
+// v0 = ...                                ; uniform
+// s[0:1] = v_cmp v0, v1                   ; bit-divergent
+// ...
+// branch s[0:1], label                    ; divergent!
+// ...
+// v1 = ...                                ; uniform
+// ...
+//
+// label:
+// v3 = phi v0, v1                         ; divergent! because of divergent branch.
+//
+// The boolean value is bit-divergent. When passed to the branch as an operand,
+// the branch becomes divergent, whose sync dependency will be computed as
+// normal to mark the appropriate values divergent (see description in normal
+// DA on how this works).
+//
+// Another difference is that in MIR some branches are turned into exec
+// updates, so propagating control-flow divergence only on branch
+// instructions would not cover exec-based control flow.
+// For a case like:
+//  %163:sreg_64_xexec = S_MOV_B64 $exec
+//bb.1:
+//; predecessors: %bb.1, %bb.0
+//  successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
+//  %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
+//  %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec
+//  %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec
+//  %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+//...
+//  $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc
+//  S_CBRANCH_EXECNZ %bb.1, implicit $exec
+// The "..." code after SAVEEXEC will be divergent if %168 is divergent.
+// The PHI should be divergent when %40 is defined inside the "...".
+// To propagate divergence from %168 to the PHI, the propagation has to start
+// from SAVEEXEC, which is control flow implemented by updating exec.
+//
+//
+// Original:
+// This file implements a general divergence analysis for loop vectorization
+// and GPU programs. It determines which branches and values in a loop or GPU
+// program are divergent. It can help branch optimizations such as jump
+// threading and loop unswitching to make better decisions.
+//
+// GPU programs typically use the SIMD execution model, where multiple threads
+// in the same execution group have to execute in lock-step. Therefore, if the
+// code contains divergent branches (i.e., threads in a group do not agree on
+// which path of the branch to take), the group of threads has to execute all
+// the paths from that branch with different subsets of threads enabled until
+// they re-converge.
+//
+// Due to this execution model, some optimizations such as jump
+// threading and loop unswitching can interfere with thread re-convergence.
+// Therefore, an analysis that computes which branches in a GPU program are
+// divergent can help the compiler to selectively run these optimizations.
+//
+// This implementation is derived from the Vectorization Analysis of the
+// Region Vectorizer (RV). That implementation in turn is based on the approach
+// described in
+//
+//   Improving Performance of OpenCL on CPUs
+//   Ralf Karrenberg and Sebastian Hack
+//   CC '12
+//
+// This DivergenceAnalysis implementation is generic in the sense that it does
+// not itself identify original sources of divergence.
+// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and
+// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence
+// (e.g., special variables that hold the thread ID or the iteration variable).
+//
+// The generic implementation propagates divergence to variables that are data
+// or sync dependent on a source of divergence.
+//
+// While data dependency is a well-known concept, the notion of sync dependency
+// is worth more explanation. Sync dependence characterizes the control flow
+// aspect of the propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// The sync dependence detection (which branch induces divergence in which join
+// points) is implemented in the SyncDependenceAnalysis.
+//
+// The current DivergenceAnalysis implementation has the following limitations:
+// 1. intra-procedural. It conservatively considers the arguments of a
+//    non-kernel-entry function and the return value of a function call as
+//    divergent.
+// 2. memory as black box. It conservatively considers values loaded from
+//    generic or local address as divergent. This can be improved by leveraging
+//    pointer analysis and/or by modelling non-escaping memory objects in SSA
+//    as done in RV.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMirDivergenceAnalysis.h"
+#include "GCNSubtarget.h"
+#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUAsmUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "SIInstrInfo.h"
+//#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/Support/Debug.h"
+//#include "newbe/cli/newbe_opts.h"  // AMDGPU change.
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mir-divergence-analysis"
+
+namespace llvm {
+bool isAMDGPUOpcodeDivergent(class MachineInstr *MI);
+}
+
+//
+// TODO: TableGen these
+//
+bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  // case R600::INTERP_LOAD_P0:
+  // case R600::INTERP_PAIR_XY:
+  // case R600::INTERP_PAIR_ZW:
+  // case R600::INTERP_VEC_LOAD:
+  // case R600::INTERP_XY:
+  // case R600::INTERP_ZW:
+  case AMDGPU::V_WRITELANE_B32:
+
+  case AMDGPU::V_INTERP_MOV_F32:
+  case AMDGPU::V_INTERP_MOV_F32_e64:
+  case AMDGPU::V_INTERP_MOV_F32_e64_vi:
+  case AMDGPU::V_INTERP_MOV_F32_si:
+  case AMDGPU::V_INTERP_MOV_F32_vi:
+  case AMDGPU::V_INTERP_P1LL_F16:
+  case AMDGPU::V_INTERP_P1LL_F16_vi:
+  case AMDGPU::V_INTERP_P1LV_F16:
+  case AMDGPU::V_INTERP_P1LV_F16_vi:
+  case AMDGPU::V_INTERP_P1_F32:
+  case AMDGPU::V_INTERP_P1_F32_16bank:
+  case AMDGPU::V_INTERP_P1_F32_16bank_si:
+  case AMDGPU::V_INTERP_P1_F32_16bank_vi:
+  case AMDGPU::V_INTERP_P1_F32_e64:
+  case AMDGPU::V_INTERP_P1_F32_e64_vi:
+  case AMDGPU::V_INTERP_P1_F32_si:
+  case AMDGPU::V_INTERP_P1_F32_vi:
+  case AMDGPU::V_INTERP_P2_F16:
+  case AMDGPU::V_INTERP_P2_F16_vi:
+  case AMDGPU::V_INTERP_P2_F32:
+  case AMDGPU::V_INTERP_P2_F32_e64:
+  case AMDGPU::V_INTERP_P2_F32_e64_vi:
+  case AMDGPU::V_INTERP_P2_F32_si:
+  case AMDGPU::V_INTERP_P2_F32_vi:
+
+  case AMDGPU::V_MBCNT_HI_U32_B32_e32:
+  case AMDGPU::V_MBCNT_HI_U32_B32_e32_gfx6_gfx7:
+  case AMDGPU::V_MBCNT_HI_U32_B32_e64:
+  case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx10:
+  case AMDGPU::V_MBCNT_HI_U32_B32_e64_gfx6_gfx7:
+  case AMDGPU::V_MBCNT_HI_U32_B32_e64_vi:
+  case AMDGPU::V_MBCNT_HI_U32_B32_sdwa:
+  case AMDGPU::V_MBCNT_LO_U32_B32_e32:
+  case AMDGPU::V_MBCNT_LO_U32_B32_e32_gfx6_gfx7:
+  case AMDGPU::V_MBCNT_LO_U32_B32_e64:
+  case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx10:
+  case AMDGPU::V_MBCNT_LO_U32_B32_e64_gfx6_gfx7:
+  case AMDGPU::V_MBCNT_LO_U32_B32_e64_vi:
+  case AMDGPU::V_MBCNT_LO_U32_B32_sdwa:
+
+  case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_ADD_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_AND_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_DEC_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_INC_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_OR_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMAX_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SMIN_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SUB_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_SWAP_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMAX_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_UMIN_X2_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_OFFSET_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_ADDR64_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_BOTHEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_IDXEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFEN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_RTN_vi:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx10:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_gfx6_gfx7:
+  case AMDGPU::BUFFER_ATOMIC_XOR_X2_OFFSET_vi:
+
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_ADD_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_AND_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si:
+  //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_DEC_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_INC_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_INC_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_OR_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_OR_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_SMAX_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_SMIN_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_SUB_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_SWAP_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_UMAX_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_UMIN_V2_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_si:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V1_vi:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_si:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V2_vi:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V1_V4_vi:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_nsa_gfx10:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_si:
+  case AMDGPU::IMAGE_ATOMIC_XOR_V2_V4_vi:
+
+  case AMDGPU::SI_PS_LIVE:
+
+  case AMDGPU::DS_SWIZZLE_B32:
+  case AMDGPU::DS_SWIZZLE_B32_gfx10:
+  case AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7:
+  case AMDGPU::DS_SWIZZLE_B32_vi:
+
+    return true;
+
+  default:
+    break;
+  }
+  return false;
+}
+
+namespace {
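+// Returns true if the source operand named by srcNameIdx on MI is an immediate
+// equal to Val and its source-modifier operand (named by srcModNameIdx), if
+// present, is zero.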
+bool hasImmOperandWithVal(const MachineInstr *MI, uint16_t srcNameIdx,
+                          uint16_t srcModNameIdx, uint64_t Val) {
+  unsigned Op = MI->getOpcode();
+  int srcIdx = AMDGPU::getNamedOperandIdx(Op, srcNameIdx);
+  if (srcIdx == -1)
+    return false;
+  const MachineOperand &srcMO = MI->getOperand(srcIdx);
+  if (srcMO.isImm() && srcMO.getImm() == Val) {
+
+    int modIdx = AMDGPU::getNamedOperandIdx(Op, srcModNameIdx);
+    if (modIdx == -1)
+      return true;
+
+    const MachineOperand &modMO = MI->getOperand(modIdx);
+    if (modMO.getImm() == 0)
+      return true;
+  }
+  return false;
+}
+
+bool isConstant(const MachineInstr *MI) {
+  unsigned Op = MI->getOpcode();
+  switch (Op) {
+  default:
+    break;
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64: {
+    // Special case: OR with -1 always yields -1.
+    const uint64_t kImm = -1;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
+                             AMDGPU::OpName::src0_modifiers, kImm))
+      return true;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
+                             AMDGPU::OpName::src1_modifiers, kImm))
+      return true;
+  } break;
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_OR_B64: {
+    // Special case: OR with -1 always yields -1.
+    const uint64_t kImm = -1;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
+                             AMDGPU::OpName::src0_modifiers, kImm))
+      return true;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
+                             AMDGPU::OpName::src1_modifiers, kImm))
+      return true;
+  } break;
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::S_AND_B64:
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64: {
+    // Special case: AND with 0 always yields 0.
+    const uint64_t kImm = 0;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src0,
+                             AMDGPU::OpName::src0_modifiers, kImm))
+      return true;
+    if (hasImmOperandWithVal(MI, AMDGPU::OpName::src1,
+                             AMDGPU::OpName::src1_modifiers, kImm))
+      return true;
+  } break;
+  }
+  return false;
+}
+
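+// Returns true if MI defines EXEC, EXEC_LO, VCC, or VCC_LO, or defines a
+// register whose class overlaps the lane-mask ("bool") register class.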
+bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
+                  const MachineRegisterInfo &MRI) {
+  const auto *BoolRC = SIRI->getBoolRC();
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
+        Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO)
+      return true;
+
+    // Check if the written register class overlaps the bool register class.
+    //
+    // Note that this check is insufficient to catch all of the cases where
+    // a "bool" value could be created (for example writing to a register
+    // pair s[0:1], then using s0 as a bool value in wave32).
+    //
+    // The underlying problem is that there are two notions of divergence
+    // (bit divergence and wave divergence) but the algorithm only propagates
+    // wave divergence. Bit divergence matters for bools because it determines
+    // whether a branch is uniform (and thus catches cases where a uniform
+    // value is used outside of a divergent control-flow region). For bool
+    // values the algorithm treats normally uniform values (i.e. scalar
+    // registers) as divergent in order to propagate bit divergence.
+    //
+    // Fixing all of the possible bugs here would require propagating bit
+    // divergence as well as wave divergence. That is a bigger fix; this check
+    // should cover most cases of treating a bool value as divergent.
+    const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
+    if (SIRI->getCommonSubClass(BoolRC, RC))
+      return true;
+  }
+  return false;
+}
+
+bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
+                       const SIRegisterInfo *SIRI,
+                       const MachineRegisterInfo &MRI) {
+  unsigned Op = MI->getOpcode();
+  switch (Op) {
+  default:
+    // Treat every scalar instruction as always uniform unless it writes a
+    // bool destination. This does not mean it is bit-uniform; branches and
+    // exec regions are checked separately with isBitUniform. A bool may live
+    // in an SGPR and still be divergent, since it simply packs all lanes into
+    // one 64/32-bit scalar register.
+    if (SIII->isScalarUnit(*MI) && !writeBoolDst(MI, SIRI, MRI) &&
+        !MI->isTerminator())
+      return true;
+    break;
+  //case AMDGPU::AMDGPU_MAKE_UNIFORM:
+  //case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
+  case AMDGPU::V_READFIRSTLANE_B32:
+  case AMDGPU::V_READLANE_B32:
+  //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
+  //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
+    // A bool produced by readfirstlane/readlane comes from a single lane, so
+    // it is bit-uniform.
+    return true;
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_OR_B64: {
+    // Special case: OR with -1 always yields -1.
+    if (isConstant(MI))
+      return true;
+
+    return !writeBoolDst(MI, SIRI, MRI);
+  } break;
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64: {
+    // Special case: OR with -1 always yields -1.
+    if (isConstant(MI))
+      return true;
+  } break;
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::S_AND_B64: {
+    // Special case: AND with 0 always yields 0.
+    if (isConstant(MI))
+      return true;
+
+    return !writeBoolDst(MI, SIRI, MRI);
+  } break;
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64: {
+    // Special case: AND with 0 always yields 0.
+    if (isConstant(MI))
+      return true;
+  } break;
+  }
+  return false;
+}
+
+bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) {
+  return reg.isPhysical();
+}
+
+bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) {
+  return MRI.getRegClass(reg)->getID() == regClassID;
+}
+
+// For a function input (live-in) register, a value passed in a VGPR is
+// divergent.
+bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI,
+                         const SIRegisterInfo *SIRI) {
+  if (isPhysicalReg(MRI, Reg)) {
+    unsigned vir_reg = MRI.getLiveInVirtReg(Reg);
+    if (SIRI->isVGPR(MRI, vir_reg))
+      return true;
+  } else {
+    if (SIRI->isVGPR(MRI, Reg))
+      return true;
+  }
+  return false;
+}
+
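+// An instruction is a source of divergence if its opcode is inherently
+// divergent, or (when it is not known to be always uniform) if it reads a
+// function live-in value that arrives in a VGPR.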
+bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI,
+                          const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  //if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent))
+  //  return true;
+  if (isAMDGPUOpcodeDivergent(MI))
+    return true;
+
+  if (isAlwaysUniformMI(MI, SIII, SIRI, MRI))
+    return false;
+
+  // If the instruction is neither guaranteed to be uniform nor divergent,
+  // check whether any of its operands are shader arguments passed in through
+  // vector registers; such operands make it divergent.
+  for (MachineOperand &op : MI->operands()) {
+    if (!op.isReg())
+      continue;
+    if (op.isDef())
+      continue;
+    unsigned reg = op.getReg();
+    if (MRI.isLiveIn(reg)) {
+      if (isDivergentInputReg(reg, MRI, SIRI))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+// For a physical register such as VCC, find the nearest def preceding MI in
+// the same MBB.
+const MachineInstr *findPhysicalDefineInSameMBB(const MachineInstr *MI,
+                                                unsigned PhyReg) {
+  const MachineBasicBlock *MBB = MI->getParent();
+  auto it = MI->getReverseIterator();
+  for (it++; it != MBB->rend(); it++) {
+    const MachineInstr &TmpMI = *it;
+    for (const MachineOperand &DefMO : TmpMI.operands()) {
+      if (!DefMO.isReg())
+        continue;
+      if (DefMO.isUse())
+        continue;
+      if (DefMO.getReg() == PhyReg)
+        return &TmpMI;
+    }
+  }
+  return nullptr;
+}
+
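+// Returns true if MI defines EXEC or EXEC_LO.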
+bool isWriteExec(const MachineInstr *MI) {
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == AMDGPU::EXEC ||
+        Reg == AMDGPU::EXEC_LO)
+      return true;
+  }
+  return false;
+}
+
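+// Matches the V_CNDMASK select opcodes (e32/e64/dpp/sdwa and the 64-bit
+// pseudo).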
+bool isVCndMask(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case AMDGPU::V_CNDMASK_B32_e32:
+  case AMDGPU::V_CNDMASK_B32_e64:
+  case AMDGPU::V_CNDMASK_B32_dpp:
+  case AMDGPU::V_CNDMASK_B32_sdwa:
+  case AMDGPU::V_CNDMASK_B64_PSEUDO:
+    return true;
+  }
+}
+
+
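+// Opcodes used to save or restore EXEC around an exec region (plain copies
+// and scalar moves).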
+bool isExecRegionOp(unsigned Op) {
+  switch (Op) {
+  default:
+    return false;
+  case AMDGPU::COPY:
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+    return true;
+  }
+}
+
+bool isRestoreExec(const MachineInstr *MI) {
+  unsigned Op = MI->getOpcode();
+  if (!isExecRegionOp(Op))
+    return false;
+
+  return isWriteExec(MI);
+}
+
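+// Given the instruction that restores EXEC at the end of an exec region, walk
+// the restored value back to its unique def and return that def if it is a
+// copy/move of EXEC (i.e. the region begin); otherwise return nullptr.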
+const MachineInstr *
+findExecRegionBeginFromRegionEnd(const MachineInstr *MI,
+                                 const MachineRegisterInfo &MRI) {
+  const MachineOperand &MO = MI->getOperand(1);
+  if (!MO.isReg())
+    return nullptr;
+  unsigned Reg = MO.getReg();
+  const MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+  if (!Def)
+    return nullptr;
+  // Make sure the def is S_MOV Reg, Exec.
+  if (!isExecRegionOp(Def->getOpcode()))
+    return nullptr;
+  const MachineOperand &ExecMO = Def->getOperand(1);
+  if (!ExecMO.isReg())
+    return nullptr;
+  unsigned ExecReg = ExecMO.getReg();
+  if (ExecReg == AMDGPU::EXEC || ExecReg == AMDGPU::EXEC_LO)
+    return Def;
+  else
+    return nullptr;
+}
+
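+// MI is inside the exec region if RegionBegin dominates it and RegionEnd
+// post-dominates it (for the same-block case, RegionEnd must appear after MI).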
+bool isInsideExecRegion(const MachineInstr &MI, const MachineInstr &RegionBegin,
+                        const MachineInstr &RegionEnd,
+                        const MachineDominatorTree &DT,
+                        const MachinePostDominatorTree &PDT) {
+  if (!DT.dominates(&RegionBegin, &MI))
+    return false;
+
+  const MachineBasicBlock *MBB = MI.getParent();
+  const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent();
+  if (MBB != RegionEndMBB) {
+    return PDT.dominates(RegionEndMBB, MBB);
+  } else {
+    // Walk the basic block until we find MI or RegionEnd.
+    MachineBasicBlock::const_iterator I = MBB->begin();
+    for (; I != MI.getIterator() && I != RegionEnd.getIterator(); ++I)
+      /*empty*/;
+
+    // RegionEnd post-dominates MI if MI is found first in the basic block.
+    return I == MI.getIterator();
+  }
+}
+
+bool isInsideExecRegion(const MachineBasicBlock &MBB,
+                        const MachineInstr &RegionBegin,
+                        const MachineInstr &RegionEnd,
+                        const MachineDominatorTree &DT,
+                        const MachinePostDominatorTree &PDT) {
+  const MachineBasicBlock *RegionBeginMBB = RegionBegin.getParent();
+  const MachineBasicBlock *RegionEndMBB = RegionEnd.getParent();
+  if (!DT.dominates(RegionBeginMBB, &MBB))
+    return false;
+  return PDT.dominates(RegionEndMBB, &MBB);
+}
+
+// TODO: Build a map from each MBB to its nearest enclosing exec region (add
+// every MBB unless it already has a smaller region). Then, when a save-exec is
+// hit, propagate divergence to users that leak out of defs inside the region.
+
+} // namespace
+
+namespace llvm {
+// class DivergenceAnalysis
+DivergenceAnalysis::DivergenceAnalysis(
+    const MachineFunction &F, const MachineLoop *RegionLoop, const MachineDominatorTree &DT,
+    const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI,
+    SyncDependenceAnalysis &SDA, bool IsLCSSAForm,
+    // AMDGPU change begin.
+    DivergentJoinMapTy &JoinMap
+    // AMDGPU change end.
+    )
+    : F(F), MRI(F.getRegInfo()), RegionLoop(RegionLoop), DT(DT), PDT(PDT),
+      LI(LI), SDA(SDA), DivergentJoinMap(JoinMap), // AMDGPU change
+      IsLCSSAForm(IsLCSSAForm) {
+  const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
+  SIRI = ST->getRegisterInfo();
+  SIII = ST->getInstrInfo();
+}
+
+void DivergenceAnalysis::markDivergent(const ValueTy DivVal) {
+  assert(!isAlwaysUniform(DivVal) && "cannot be divergent");
+  // AMDGPU change begin.
+  LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
+             const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+             dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI););
+  //AMDGPU change end.
+  DivergentValues.insert(DivVal);
+}
+
+// Mir change.
+void DivergenceAnalysis::markDivergent(const MachineInstr &I) {
+  for (const MachineOperand &DstMO : I.defs()) {
+    unsigned Reg = DstMO.getReg();
+    markDivergent(Reg);
+  }
+  DivergentInsts.insert(&I);
+}
+
+void DivergenceAnalysis::addUniformOverride(const ValueTy UniVal) {
+  // TODO: support uniform multi-def.
+  if (MRI.getUniqueVRegDef(UniVal) == nullptr)
+    return;
+
+  UniformOverrides.insert(UniVal);
+}
+
+void DivergenceAnalysis::addUniformOverride(const MachineInstr &I) {
+  for (const MachineOperand &DstMO : I.defs()) {
+    unsigned Reg = DstMO.getReg();
+    addUniformOverride(Reg);
+  }
+  UniformOverridesInsts.insert(&I);
+}
+
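+// An operand is "bit uniform" if every active lane sees the same bit pattern,
+// e.g. the immediates 0/-1, EXEC, SCC, or a value whose defining instruction
+// is itself bit-uniform.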
+bool DivergenceAnalysis::isBitUniform(
+    const MachineInstr &I, const llvm::MachineOperand &UseMO,
+    llvm::DenseMap<const MachineInstr *, bool> &Processed) const {
+  if (UseMO.isImm()) {
+    uint64_t val = UseMO.getImm();
+    // 0 and -1 are OK since all lanes are still the same.
+    if (val == 0 || val == -1)
+      return true;
+    else
+      return false;
+  }
+  if (!UseMO.isReg())
+    return true;
+  unsigned Reg = UseMO.getReg();
+  // Exec is always bituniform, because all active lanes are 1.
+  if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
+      // SCC only has 1 bit. Always bituniform.
+      Reg == AMDGPU::SCC)
+    return true;
+
+  const MachineInstr *UseMI = nullptr;
+  if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO) {
+    // Try to find define of this VCC.
+    UseMI = findPhysicalDefineInSameMBB(&I, Reg);
+  } else {
+    UseMI = MRI.getUniqueVRegDef(Reg);
+  }
+  if (!UseMI) {
+    return false;
+  }
+
+  bool bResult = isBitUniform(*UseMI, Processed);
+  Processed[UseMI] = bResult;
+  return bResult;
+}
+
+bool DivergenceAnalysis::isBitUniform(
+    const MachineInstr &I,
+    llvm::DenseMap<const MachineInstr *, bool> &Processed) const {
+  auto it = Processed.find(&I);
+  if (it != Processed.end())
+    return it->second;
+  // For a branch in MIR, all active lanes must hold the same value.
+  // A compare of uniform values guarantees this, and an immediate is the same
+  // for all active lanes.
+  if (isDivergent(I))
+    return false;
+  // Uniform cmp is bit uniform.
+  if (I.isCompare())
+    return true;
+  if (isConstant(&I))
+    return true;
+
+  // Conservatively consider bituniform to be false.
+  Processed[&I] = false;
+
+  // If all operands are bit-uniform, the result is bit-uniform.
+  bool bAllOperandBitUniform = true;
+  for (const MachineOperand &UseMO : I.uses()) {
+    if (isBitUniform(I, UseMO, Processed))
+      continue;
+    bAllOperandBitUniform = false;
+    break;
+  }
+  return bAllOperandBitUniform;
+}
+
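+// A terminator with more than one successor is divergent unless its operands
+// are uniform and bit-uniform (all active lanes take the same branch);
+// SI_CALL is always treated as divergent.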
+bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const {
+  if (Term.getParent()->succ_size() <= 1)
+    return false;
+  switch (Term.getOpcode()) {
+  default: {
+    if (updateNormalInstruction(Term))
+      return true;
+    llvm::DenseMap<const MachineInstr *, bool> Processed;
+    // Check bit uniform here if not divergent.
+    return !isBitUniform(Term, Processed);
+  }
+  //case AMDGPU::AMDGPU_CALL_INDIRECT:
+  case AMDGPU::SI_CALL:
+    return true;
+  }
+}
+
+bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const {
+  // TODO function calls with side effects, etc
+  if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end())
+    return false;
+  if (DivergentInsts.find(&I) != DivergentInsts.end())
+    return true;
+  for (const auto &Op : I.uses()) {
+    if (!Op.isReg())
+      continue;
+    Register Reg = Op.getReg();
+    if (Reg.isPhysical()) {
+      if (Reg == AMDGPU::EXEC ||
+          Reg == AMDGPU::EXEC_LO ||
+          Reg == AMDGPU::SCC)
+        continue;
+      else if (const MachineInstr *DefMI =
+                   findPhysicalDefineInSameMBB(Op.getParent(), Reg)) {
+        if (isDivergent(*DefMI))
+          return true;
+      } else {
+        // If cannot find def in same MBB, just treat it as divergent.
+        return true;
+      }
+    } else {
+      if (isDivergent(Op.getReg()))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool DivergenceAnalysis::isTemporalDivergent(const MachineBasicBlock &ObservingBlock,
+                                             const ValueTy Val,
+                                             const MachineBasicBlock &IncomingBlock) const { // AMDGPU change
+  const MachineBasicBlock *DefBlock = &IncomingBlock; // AMDGPU change: Take def point as incoming block for constants.
+  const auto *Inst = MRI.getUniqueVRegDef(Val);
+  if (Inst == nullptr)
+    return true;
+  DefBlock = Inst->getParent();
+
+  // check whether any divergent loop carrying Val terminates before control
+  // proceeds to ObservingBlock
+  for (const auto *MachineLoop = LI.getLoopFor(DefBlock); // AMDGPU change
+       MachineLoop != RegionLoop && !MachineLoop->contains(&ObservingBlock);
+       MachineLoop = MachineLoop->getParentLoop()) {
+    if (DivergentLoops.find(MachineLoop) != DivergentLoops.end())
+      return true;
+  }
+
+  return false;
+}
+
+// AMDGPU CHANGE BEGIN
+static bool HasIncomingUndefValue(const PHINode_ *Phi) {
+  for (unsigned I = 1, E = Phi->getNumOperands(); I != E; I += 2) {
+    const MachineOperand &Op = Phi->getOperand(I);
+    if (Op.isUndef())
+      return true;
+  }
+  return false;
+}
+
+// For a case like:
+//   %163:sreg_64_xexec = S_MOV_B64 $exec
+// bb.1:
+// ; predecessors: %bb.1, %bb.0
+//   successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
+//   %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
+//   %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec
+//   %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec
+//   %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+//   ...
+//   $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc
+//   S_CBRANCH_EXECNZ %bb.1, implicit $exec
+// the code after the S_AND_SAVEEXEC (the "..." above) is divergent if %168 is
+// divergent.
+// Return the SaveExec that affects MI, or nullptr if there is none.
+static const MachineInstr *
+findSaveExec(const MachineInstr *MI,
+             const SmallVector<const MachineInstr *, 2> &SaveExecs) {
+  // No save exec.
+  if (SaveExecs.empty())
+    return nullptr;
+  if (SaveExecs.size() > 1)
+    llvm::report_fatal_error(
+        "Unsupported case: MBB has more than one SaveExec");
+  const MachineInstr *SaveExec = SaveExecs.front();
+  const MachineBasicBlock *MBB = SaveExec->getParent();
+  // Make sure MI is after SaveExec by checking that it is not before it.
+  // Assume the range from MBB.begin() to SaveExec is short here.
+  bool bIsAfterSaveExec = true;
+  for (auto it = MBB->begin(); it != SaveExec->getIterator(); it++) {
+    if (MI == it) {
+      bIsAfterSaveExec = false;
+      break;
+    }
+  }
+  // Not affected by the save exec.
+  if (!bIsAfterSaveExec)
+    return nullptr;
+
+  return SaveExec;
+}
+
+// When a Phi's parent block is join-divergent, the phi becomes divergent when
+// two different incoming values merge from different paths of a divergent
+// branch. isJoinDivergentOnlyOnSameIncomingValue checks all combinations of
+// incoming blocks except those that carry the same incoming value, because
+// equal values stay equal even across a divergent branch.
+// For example, for phi a:A, b:B, a:C it checks (A,B) and (B,C) but not (A,C),
+// since A and C carry the same value a. If only (A,C) share a divergent
+// branch, then phi a:A, b:B, a:C is still uniform.
+// DivergentJoinMap stores pairs of MachineBasicBlocks that lie on different
+// paths of a divergent branch and join at one block.
+// For example,
+//    A
+//  /   \
+// |     \
+// |      \
+// B       /
+// | \    /
+// |  \  /
+// C   D
+// |   /
+//  \ /
+//   E
+// If A is a uniform branch and B is a divergent branch, then only (C, D) is
+// saved in DivergentJoinMap.
+// DivergentJoinMap is built by updateDisjointMap in
+// SyncDependenceAnalysis.cpp when SyncDependenceAnalysis::join_block is
+// called. It only runs on divergent branches, so (A, B) is not in
+// DivergentDisjointMap when A is uniform.
+static bool isJoinDivergentOnlyOnSameIncomingValue(
+    const PHINode_ &Phi, const DivergenceAnalysis *pDA, const MachineDominatorTree &DT,
+    DivergentJoinMapTy &DivergentJoinMap) {
+  // For a phi at a divergent join, if the incoming values from the divergent
+  // branch are the same, the phi is still uniform.
+  // A
+  // | \
+  // |  \
+  // B   \
+  // |\   \
+  // | \   |
+  // C  D  E
+  // |  /  /
+  //  \/  /
+  //   \ /
+  //    F
+  // For a phi in F like phi (a:C, a:D, b:E):
+  // if A is a uniform branch and B is a non-uniform branch, the phi is
+  // uniform.
+  SmallDenseSet<unsigned, 8> ValueToBlockMap;
+  for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
+    const MachineOperand &Op = Phi.getOperand(I);
+    if (!Op.isReg())
+      continue;
+    unsigned Reg = Op.getReg();
+    if (pDA->isDivergent(Reg))
+      return false;
+
+    ValueToBlockMap.insert(Reg);
+  }
+  unsigned NumIncoming = (Phi.getNumOperands() - 1) / 2;
+  // There is at least one value that comes in from more than one incoming
+  // block. If the divergent selection is only between equal values, the phi
+  // is still uniform.
+  if (ValueToBlockMap.size() != NumIncoming) {
+    // When a phi is in a divergent join block, some incoming blocks come from
+    // different paths of a divergent branch. Check all combinations here.
+    for (unsigned i = 0; i < NumIncoming; i++) {
+      MachineBasicBlock *BB0 = Phi.getOperand(2 + 2 * i).getMBB();
+      const MachineOperand &MO0 = Phi.getOperand(1 + 2 * i);
+      for (unsigned j = i + 1; j < NumIncoming; j++) {
+        MachineBasicBlock *BB1 = Phi.getOperand(2 + 2 * j).getMBB();
+        const MachineOperand &MO1 = Phi.getOperand(1 + 2 * j);
+        // If the values match, there is no divergence.
+        if (MO0.isImm() && MO1.isImm() && MO0.getImm() == MO1.getImm())
+          continue;
+        if (MO0.isReg() && MO1.isReg() && MO0.getReg() == MO1.getReg() &&
+            MO0.getSubReg() == MO1.getSubReg())
+          continue;
+
+        // If BB0 and BB1 lie on disjoint paths of a divergent branch, they
+        // join divergently at this phi.
+        // This handles cases like
+        //    A
+        //  /   \
+        // |     \
+        // |      \
+        // B       /
+        // | \    /
+        // |  \  /
+        // C   D
+        // |   /
+        //  \ /
+        //   E
+        //
+        // phi(a:C, b:D)
+        // where the nearestCommonDominator is A, but B can also be the
+        // divergent disjoint point for C and D.
+        if (DivergentJoinMap[BB0].count(BB1))
+          return false;
+      }
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+// AMDGPU CHANGE END
+
+bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const {
+  // AMDGPU CHANGE BEGIN
+  // Do not mark phis that have undef incoming values as uniform.
+  // When promoting to scalar we will readfirstlane on
+  // the phi output. If some of the inputs are undef then
+  // this could replace a well defined vector value with an
+  // undefined scalar value.
+  if (HasIncomingUndefValue(&Phi))
+    return true;
+  // AMDGPU CHANGE END
+
+  // Divergent disjoint paths join in the Phi's parent block.
+  if (isJoinDivergent(*Phi.getParent())) {
+    // AMDGPU CHANGE BEGIN
+    if (true/*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) {
+      // Continue if the divergent join is only between identical incoming
+      // values.
+      if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT,
+                                                  DivergentJoinMap))
+        return true;
+    } else
+    // AMDGPU CHANGE END
+    return true;
+  }
+
+  // An incoming value could be divergent by itself.
+  // Otherwise, an incoming value could be uniform within the loop
+  // that carries its definition but it may appear divergent
+  // from outside the loop. This happens when divergent loop exits
+  // drop definitions of that uniform value in different iterations.
+  //
+  // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
+  //   if (i % thread_id == 0) break;    // divergent loop exit
+  // }
+  // int divI = i;                 // divI is divergent
+  for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
+    const MachineOperand &Op = Phi.getOperand(I);
+    if (!Op.isReg())
+      continue;
+
+    unsigned Reg = Op.getReg();
+    const MachineOperand &BB = Phi.getOperand(I + 1);
+    if (isDivergent(Reg) ||
+        isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB()))
+      return true;
+
+  }
+
+  return false;
+}
+
+bool DivergenceAnalysis::updateVCndMask(const MachineInstr &VCndMask) const {
+  // For a V_CNDMASK result to be uniform, its condition operand must be
+  // bit-uniform.
+  unsigned Op = VCndMask.getOpcode();
+  unsigned src0Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src0);
+  unsigned src1Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src1);
+  unsigned src2Idx = AMDGPU::getNamedOperandIdx(Op, AMDGPU::OpName::src2);
+
+  const MachineOperand &src0 = VCndMask.getOperand(src0Idx);
+  const MachineOperand &src1 = VCndMask.getOperand(src1Idx);
+
+  const MachineOperand &cond = VCndMask.getOperand(src2Idx);
+
+  if (isDivergent(src0))
+    return true;
+
+  // If src0 and src1 are the same operand (same register, subregister and
+  // modifiers), the result does not depend on the condition and has src0's
+  // divergence, which was already checked above.
+  if (src0.isReg() && src1.isReg() && src0.getReg() == src1.getReg()) {
+    if (src0.getSubReg() == src1.getSubReg() &&
+        SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src0_modifiers) ==
+            SIII->hasModifiersSet(VCndMask, AMDGPU::OpName::src1_modifiers))
+      return false;
+  }
+
+  if (isDivergent(src1))
+    return true;
+
+  llvm::DenseMap<const MachineInstr *, bool> Processed;
+  return !isBitUniform(VCndMask, cond, Processed);
+}
+
+bool DivergenceAnalysis::inRegion(const MachineInstr &I) const {
+  return I.getParent() && inRegion(*I.getParent());
+}
+
+bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const {
+  return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
+}
+
+// marks all users of loop-carried values of the loop headed by LoopHeader as
+// divergent
+void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) {
+  auto *DivLoop = LI.getLoopFor(&LoopHeader);
+  assert(DivLoop && "loopHeader is not actually part of a loop");
+
+  SmallVector<MachineBasicBlock *, 8> TaintStack;
+  DivLoop->getExitBlocks(TaintStack);
+
+  // Otherwise potential users of loop-carried values could be anywhere in the
+  // dominance region of DivLoop (including its fringes for phi nodes)
+  DenseSet<const MachineBasicBlock *> Visited;
+  for (auto *Block : TaintStack) {
+    Visited.insert(Block);
+  }
+  Visited.insert(&LoopHeader);
+
+  while (!TaintStack.empty()) {
+    auto *UserBlock = TaintStack.back();
+    TaintStack.pop_back();
+
+    // don't spread divergence beyond the region
+    if (!inRegion(*UserBlock))
+      continue;
+
+    assert(!DivLoop->contains(UserBlock) &&
+           "irreducible control flow detected");
+
+    // phi nodes at the fringes of the dominance region
+    if (!DT.dominates(&LoopHeader, UserBlock)) {
+      // all PHI nodes of UserBlock become divergent
+      pushPHINodes(*UserBlock);
+      continue;
+    }
+
+    // taint outside users of values carried by DivLoop
+    for (auto &I : *UserBlock) {
+      if (isAlwaysUniformMI(&I, SIII, SIRI, MRI))
+        continue;
+      if (isDivergent(I))
+        continue;
+
+      for (auto &Op : I.uses()) {
+        if (!Op.isReg())
+          continue;
+        unsigned OpReg = Op.getReg();
+        MachineInstr *OpInst = MRI.getUniqueVRegDef(OpReg);
+        if (!OpInst)
+          continue;
+        if (DivLoop->contains(OpInst->getParent())) {
+          markDivergent(I);
+          pushUsers(I);
+          break;
+        }
+      }
+    }
+
+    // visit all blocks in the dominance region
+    for (auto *SuccBlock : UserBlock->successors()) {
+      if (!Visited.insert(SuccBlock).second) {
+        continue;
+      }
+      TaintStack.push_back(SuccBlock);
+    }
+  }
+}
+
+void DivergenceAnalysis::pushInstruction(const MachineInstr &I) { 
+  Worklist.push_back(&I);
+}
+void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) {
+  for (const auto &Phi : Block.phis()) {
+    if (isDivergent(Phi))
+      continue;
+    pushInstruction(Phi);
+  }
+}
+
+void DivergenceAnalysis::pushUsers(const ValueTy V) {
+  for (const auto &UserInst : MRI.use_nodbg_instructions(V)) {
+
+    if (isDivergent(UserInst))
+      continue;
+
+    // Only compute divergence inside the region.
+    if (!inRegion(UserInst))
+      continue;
+
+    Worklist.push_back(&UserInst);
+  }
+}
+void DivergenceAnalysis::pushUsers(const MachineInstr &I) {
+  for (const auto &DstMO : I.defs()) {
+    unsigned Reg = DstMO.getReg();
+    pushUsers(Reg);
+  }
+}
+
+bool DivergenceAnalysis::propagateJoinDivergence(const MachineBasicBlock &JoinBlock,
+                                                 const MachineLoop *BranchLoop) {
+  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
+
+  // ignore divergence outside the region
+  if (!inRegion(JoinBlock)) {
+    return false;
+  }
+
+  // push non-divergent phi nodes in JoinBlock to the worklist
+  pushPHINodes(JoinBlock);
+
+  // JoinBlock is a divergent loop exit
+  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
+    return true;
+  }
+
+  // disjoint-paths divergent at JoinBlock
+  markBlockJoinDivergent(JoinBlock);
+  return false;
+}
+
+void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) {
+  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");
+
+  markDivergent(Term);
+
+  const auto *BranchLoop = LI.getLoopFor(Term.getParent());
+
+  // whether there is a divergent loop exit from BranchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // Iterate over all blocks reachable by disjoint paths from Term within the
+  // loop; this also iterates over loop exits that become divergent due to Term.
+  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // Branch loop is a divergent loop due to the divergent branch in Term
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) {
+  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() << "\n");
+
+  // don't propagate beyond region
+  if (!inRegion(*ExitingLoop.getHeader()))
+    return;
+
+  const auto *BranchLoop = ExitingLoop.getParentLoop();
+
+  // Uses of loop-carried values could occur anywhere
+  // within the dominance region of the definition. All loop-carried
+  // definitions are dominated by the loop header (reducible control).
+  // Thus all users have to be in the dominance region of the loop header,
+  // except PHI nodes that can also live at the fringes of the dom region
+  // (incoming defining value).
+  if (!IsLCSSAForm)
+    taintLoopLiveOuts(*ExitingLoop.getHeader());
+
+  // whether there is a divergent loop exit from BranchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint paths from exits of
+  // ExitingLoop also iterates over loop exits (of BranchLoop) that in turn
+  // become divergent.
+  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // Branch loop is divergent due to a divergent loop exit in ExitingLoop.
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+// For case like
+//  %149:sreg_64_xexec = S_MOV_B64 $exec
+//
+//bb.3:
+//; predecessors: %bb.3, %bb.2
+//  successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), %bb.4(50.00%)
+//
+//  %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3
+//  %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec
+//  %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec
+//  %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+//  $m0 = S_MOV_B32 %153:sgpr_32
+//  %55:vreg_512 = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit $exec
+//  $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc
+//  S_CBRANCH_EXECNZ %bb.3, implicit $exec
+//
+//bb.4:
+//; predecessors: %bb.3
+//  successors: %bb.5(0x80000000); %bb.5(100.00%)
+//
+//  $exec = S_MOV_B64 %149:sreg_64_xexec
+
+// bb.3 is inside an exec region whose exec is saved by %149.
+// The S_AND_SAVEEXEC_B64 defining %152 updates exec, which causes divergence
+// when the mask is not bit-uniform. Everything inside the exec region needs to
+// be scanned. Uses outside the region, and phi uses, should be marked as
+// divergent and their users added to the worklist.
+void DivergenceAnalysis::propagateExecControlFlowDivergence(
+    const MachineInstr &SaveExec) {
+  const MachineBasicBlock *MBB = SaveExec.getParent();
+  auto it = ExecRegionMap.find(MBB);
+  if (it == ExecRegionMap.end())
+    return;
+  ExecRegion &Region = *it->second;
+  // Each region only needs to be propagated once.
+  if (Region.bPropagated)
+    return;
+  Region.bPropagated = true;
+  // Scan all MIs in the region. Mark out-of-region and phi uses as divergent
+  // and add their users to the worklist.
+  auto propagateExecDivergence = [this, &Region](const MachineInstr *MI) {
+    for (const auto &DstMO : MI->defs()) {
+      Register Reg = DstMO.getReg();
+      // Physical defs here can only be VCC/EXEC/M0.
+      // EXEC is always uniform; assume VCC and M0 do not cross the region.
+      if (Reg.isPhysical())
+        continue;
+      for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) {
+
+        if (isDivergent(UserInst))
+          continue;
+
+        // Only propagate to users outside the region, or to phis, since they
+        // are not guarded by the saved exec.
+        if (UserInst.getOpcode() != AMDGPU::PHI &&
+            isInsideExecRegion(UserInst, *Region.begin, *Region.end, DT, PDT)) {
+          continue;
+        }
+        // Writes to exec are not divergent.
+        if (isWriteExec(&UserInst))
+          continue;
+
+        markDivergent(UserInst);
+        pushUsers(UserInst);
+      }
+    }
+  };
+  const MachineBasicBlock *RegionBeginMBB = Region.begin->getParent();
+  const MachineBasicBlock *RegionEndMBB = Region.end->getParent();
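+  // If the region spans several blocks, scan the tail of the begin block, all
+  // blocks fully contained in the region, and the head of the end block;
+  // otherwise scan the instructions between begin and end within the single
+  // block.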
+  if (RegionBeginMBB != RegionEndMBB) {
+    auto it = Region.begin->getIterator();
+    for (it++; it != RegionBeginMBB->end(); it++) {
+      const MachineInstr &MI = *it;
+      propagateExecDivergence(&MI);
+    }
+
+    // All blocks between RegionBeginMBB and RegionEndMBB.
+    for (const MachineBasicBlock *MBB : Region.blocks) {
+      for (const MachineInstr &MI : *MBB) {
+        propagateExecDivergence(&MI);
+      }
+    }
+
+    for (auto it = RegionEndMBB->begin(); it != Region.end->getIterator();
+         it++) {
+      const MachineInstr &MI = *it;
+      propagateExecDivergence(&MI);
+    }
+
+  } else {
+    auto it = Region.begin->getIterator();
+    for (it++; it != Region.end->getIterator(); it++) {
+      const MachineInstr &MI = *it;
+      propagateExecDivergence(&MI);
+    }
+  }
+}
+
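+// Top-level driver: seed the worklist with divergent terminators and with
+// V_CNDMASKs that need bit-uniformity checks, build exec regions, push the
+// users of all known divergent values, then iterate the worklist to a fixed
+// point, propagating data, branch and exec-mask induced divergence.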
+void DivergenceAnalysis::compute() {
+  SmallVector<ExecRegion, 4> ExecRegions;
+  // Build exec regions.
+  // Push V_CNDMASKs so that non-bit-uniformity caused by input SGPRs is
+  // detected.
+  for (const MachineBasicBlock &MBB : F) {
+    for (const MachineInstr &Term : MBB.terminators()) {
+      if (updateTerminator(Term))
+        pushInstruction(Term);
+    }
+
+    for (const MachineInstr &I : MBB) {
+      unsigned Opcode = I.getOpcode();
+      if (isVCndMask(Opcode)) {
+        // The condition of a V_CNDMASK needs a bit-uniformity check.
+        // Add it to the worklist to check bit uniformity of its inputs.
+        pushInstruction(I);
+      } else if (isRestoreExec(&I)) {
+        const MachineInstr *RegionBegin =
+            findExecRegionBeginFromRegionEnd(&I, MRI);
+        if (RegionBegin) {
+          ExecRegions.emplace_back(ExecRegion(RegionBegin, &I));
+        }
+      }
+    }
+  }
+
+  // Build exec region map.
+  for (const MachineBasicBlock &MBB : F) {
+    for (ExecRegion &Region : ExecRegions) {
+      if (isInsideExecRegion(MBB, *Region.begin, *Region.end, DT, PDT)) {
+        // Add block to region.
+        if (&MBB != Region.begin->getParent() &&
+            &MBB != Region.end->getParent())
+          Region.blocks.emplace_back(&MBB);
+        // Update ExecRegionMap.
+        auto it = ExecRegionMap.find(&MBB);
+        if (it == ExecRegionMap.end()) {
+          ExecRegionMap[&MBB] = &Region;
+        } else {
+          // When MBB is inside multiple regions, keep the smallest one.
+          if (isInsideExecRegion(*Region.begin, *it->second->begin,
+                                 *it->second->end, DT, PDT)) {
+            ExecRegionMap[&MBB] = &Region;
+          }
+        }
+      }
+    }
+  }
+
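+  // Seed propagation from all values already known to be divergent.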
+  for (auto DivVal : DivergentValues) {
+    LLVM_DEBUG(dbgs() << "\t sourceOfDivergence :"; printReg(DivVal, SIRI);
+               dbgs() << "\n";);
+    pushUsers(DivVal);
+  }
+
+  // propagate divergence
+  while (!Worklist.empty()) {
+    const MachineInstr *I = Worklist.back();
+    Worklist.pop_back();
+
+    // maintain uniformity of overrides
+    if (isAlwaysUniformMI(I, SIII, SIRI, MRI)) {
+      // If the result is used by terminators and is not bit-uniform,
+      // add those terminators to the worklist.
+      SmallVector<const MachineInstr *, 2> TermUsers;
+      for (const auto &DstMO : I->defs()) {
+        unsigned Reg = DstMO.getReg();
+        for (const auto &UserInst : MRI.use_nodbg_instructions(Reg)) {
+
+          if (isDivergent(UserInst))
+            continue;
+          // Only check terminator here.
+          if (!UserInst.isTerminator())
+            continue;
+
+          // Only compute divergence inside the region.
+          if (!inRegion(UserInst))
+            continue;
+
+          TermUsers.emplace_back(&UserInst);
+        }
+      }
+
+      if (!TermUsers.empty()) {
+        llvm::DenseMap<const MachineInstr *, bool> Processed;
+        if (!isBitUniform(*I, Processed)) {
+          for (const MachineInstr *Term : TermUsers) {
+            Worklist.emplace_back(Term);
+          }
+        }
+      }
+
+      continue;
+    }
+
+    bool WasDivergent = isDivergent(*I);
+    if (WasDivergent)
+      continue;
+
+    // propagate divergence caused by terminator
+    if (I->isTerminator()) {
+      if (updateTerminator(*I)) {
+        // propagate control divergence to affected instructions
+        propagateBranchDivergence(*I);
+        continue;
+      }
+    }
+
+    // update divergence of I due to divergent operands
+    bool DivergentUpd = false;
+    unsigned Opcode = I->getOpcode();
+    switch (Opcode) {
+    default:
+      if (isVCndMask(Opcode)) {
+        DivergentUpd = updateVCndMask(*I);
+      } else {
+        DivergentUpd = updateNormalInstruction(*I);
+        llvm::DenseMap<const MachineInstr *, bool> Processed;
+        if ((DivergentUpd || !isBitUniform(*I, Processed)) && isWriteExec(I)) {
+          // propagate exec control divergence to affected instructions.
+          propagateExecControlFlowDivergence(*I);
+        }
+      }
+      break;
+    case AMDGPU::PHI:
+      DivergentUpd = updatePHINode(*I);
+      break;
+    }
+
+    // propagate value divergence to users
+    if (DivergentUpd) {
+      markDivergent(*I);
+      pushUsers(*I);
+    }
+  }
+}
+
+bool DivergenceAnalysis::isAlwaysUniform(const ValueTy V) const {
+  return UniformOverrides.find(V) != UniformOverrides.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const ValueTy V) const {
+  return DivergentValues.find(V) != DivergentValues.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const MachineOperand &MO) const {
+  if (!MO.isReg())
+    return false;
+  Register Reg = MO.getReg();
+  if (Reg.isPhysical()) {
+    const MachineInstr *MI = MO.getParent();
+    if (MI)
+      return isDivergent(*MI);
+
+  } else {
+    return isDivergent(Reg);
+  }
+  return true;
+}
+
+bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const {
+  if (UniformOverridesInsts.find(&I) != UniformOverridesInsts.end())
+    return false;
+  if (DivergentInsts.find(&I) != DivergentInsts.end())
+    return true;
+  for (const MachineOperand &DstMO : I.defs()) {
+    unsigned Reg = DstMO.getReg();
+    if (isDivergent(Reg))
+      return true;
+  }
+  return false;
+}
+
+void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const {
+  // Iterate over the instructions block by block to get a deterministic order.
+  for (auto &MBB : F)
+  for (auto &I : MBB) {
+    if (isDivergent(I))
+      OS << "DIVERGENT:" << I ;
+    // AMDGPU changes begin
+    else
+      OS << "UNIFORM:" << I ;
+    // AMDGPU changes end
+  }
+}
+
+// class MirGPUDivergenceAnalysis
+MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(MachineFunction &F,
+                                             const MachineDominatorTree &DT,
+                                             const MachinePostDominatorTree &PDT,
+                                             const MachineLoopInfo &LI)
+    : SDA(DT, PDT, LI, /*AMDGPU change*/DivergentJoinMap),
+      DA(F, nullptr, DT, PDT, LI, SDA, false, /*AMDGPU change*/DivergentJoinMap) {
+  MachineRegisterInfo &MRI = F.getRegInfo();
+  const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *SIRI = ST->getRegisterInfo();
+  const SIInstrInfo *SIII = ST->getInstrInfo();
+  for (auto &MBB : F)
+    for (auto &I : MBB) {
+      if (isSourceOfDivergence(&I, MRI, SIRI, SIII)) {
+        DA.markDivergent(I);
+      } else if (isAlwaysUniformMI(&I, SIII, SIRI, MRI)) {
+        DA.addUniformOverride(I);
+      }
+    }
+  for (auto &ArgIt : F.getRegInfo().liveins()) {
+    unsigned Reg = ArgIt.first;
+    if (isDivergentInputReg(Reg, MRI, SIRI)) {
+      DA.markDivergent(Reg);
+    }
+  }
+
+  DA.compute();
+}
+
+bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const {
+  return DA.isDivergent(*I);
+}
+
+void MirGPUDivergenceAnalysis::print(raw_ostream &OS, const Module_ *mod) const {
+  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
+  DA.print(OS, mod);
+  OS << "}\n";
+}
+
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
new file mode 100644
index 000000000000000..edcf96ec44a4d59
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
@@ -0,0 +1,281 @@
+//===- AMDGPUMirDivergenceAnalysis.h - MIR Divergence Analysis -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// The divergence analysis determines which instructions and branches are
+// divergent given a set of divergent source instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "AMDGPUMirSyncDependenceAnalysis.h"
+#include "llvm/Pass.h"
+#include <vector>
+
+namespace llvm {
+class raw_ostream;
+class TargetTransformInfo;
+class MachineRegisterInfo;
+class SIInstrInfo;
+class SIRegisterInfo;
+class MachineOperand;
+class MachineBasicBlock;
+
+using Module_ = void;
+class TargetTransformInfo;
+using ValueTy = unsigned;
+using PHINode_ = MachineInstr;
+
+/// \brief Generic divergence analysis for reducible CFGs.
+///
+/// This analysis propagates divergence in a data-parallel context from sources
+/// of divergence to all users. It requires reducible CFGs. All assignments
+/// should be in SSA form.
+class DivergenceAnalysis {
+public:
+  /// \brief This instance will analyze the whole function \p F or the loop \p
+  /// RegionLoop.
+  ///
+  /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop.
+  /// Otherwise the whole function is analyzed.
+  /// \param IsLCSSAForm whether the analysis may assume that the IR in the
+  /// region is in LCSSA form.
+  DivergenceAnalysis(const llvm::MachineFunction &F, const MachineLoop *RegionLoop,
+                     const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+                     const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA,
+                     bool IsLCSSAForm,
+                     // AMDGPU change begin.
+                     DivergentJoinMapTy &JoinMap
+                     // AMDGPU change end.
+  );
+
+  /// \brief The loop that defines the analyzed region (if any).
+  const MachineLoop *getRegionLoop() const { return RegionLoop; }
+  const llvm::MachineFunction &getFunction() const { return F; }
+
+  /// \brief Whether \p BB is part of the region.
+  bool inRegion(const MachineBasicBlock &BB) const;
+  /// \brief Whether \p I is part of the region.
+  bool inRegion(const MachineInstr &I) const;
+
+  /// \brief Mark \p UniVal as a value that is always uniform.
+  void addUniformOverride(const ValueTy UniVal);
+  void addUniformOverride(const MachineInstr &I);
+
+  /// \brief Mark \p DivVal as a value that is always divergent.
+  void markDivergent(const ValueTy DivVal);
+  void markDivergent(const MachineInstr &I);
+
+  /// \brief Propagate divergence to all instructions in the region.
+  /// Divergence is seeded by calls to \p markDivergent.
+  void compute();
+
+  /// \brief Whether any value was marked or analyzed to be divergent.
+  bool hasDetectedDivergence() const { return !DivergentValues.empty(); }
+
+  /// \brief Whether \p Val will always return a uniform value regardless of its
+  /// operands
+  bool isAlwaysUniform(const ValueTy Val) const;
+
+  /// \brief Whether \p Val is a divergent value
+  bool isDivergent(const ValueTy Val) const;
+  bool isDivergent(const MachineInstr &I) const;
+
+  void print(llvm::raw_ostream &OS, const Module_ *) const;
+
+private:
+  bool isDivergent(const llvm::MachineOperand &MO) const;
+  bool updateTerminator(const MachineInstr &Term) const;
+  bool updatePHINode(const PHINode_ &Phi) const;
+  bool updateVCndMask(const MachineInstr &VCndMask) const;
+  bool isBitUniform(const MachineInstr &I,
+                    llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
+  bool isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO,
+                    llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
+
+  /// \brief Computes whether \p Inst is divergent based on the
+  /// divergence of its operands.
+  ///
+  /// \returns Whether \p Inst is divergent.
+  ///
+  /// This should only be called for non-phi, non-terminator instructions.
+  bool updateNormalInstruction(const MachineInstr &Inst) const;
+
+  /// \brief Mark users of live-out users as divergent.
+  ///
+  /// \param LoopHeader the header of the divergent loop.
+  ///
+  /// Marks all users of live-out values of the loop headed by \p LoopHeader
+  /// as divergent and puts them on the worklist.
+  void taintLoopLiveOuts(const MachineBasicBlock &LoopHeader);
+
+  /// \brief Push all users of \p Val (in the region) to the worklist
+  void pushUsers(const ValueTy I);
+  void pushUsers(const MachineInstr &I);
+
+  void pushInstruction(const MachineInstr &I);
+  /// \brief Push all phi nodes in \p Block to the worklist.
+  void pushPHINodes(const MachineBasicBlock &Block);
+
+  /// \brief Mark \p Block as join divergent
+  ///
+  /// A block is join divergent if two threads may reach it from different
+  /// incoming blocks at the same time.
+  void markBlockJoinDivergent(const MachineBasicBlock &Block) {
+    DivergentJoinBlocks.insert(&Block);
+  }
+
+  /// \brief Whether \p Val is divergent when read in \p ObservingBlock.
+  bool isTemporalDivergent(const MachineBasicBlock &ObservingBlock,
+                           const ValueTy Val,
+                           const MachineBasicBlock &incomingBlock) const; // AMDGPU change
+
+  /// \brief Whether \p Block is join divergent
+  ///
+  /// (see markBlockJoinDivergent).
+  bool isJoinDivergent(const MachineBasicBlock &Block) const {
+    return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end();
+  }
+
+  /// \brief Propagate control-induced divergence to users (phi nodes and
+  /// instructions).
+  //
+  // \param JoinBlock is a divergent loop exit or join point of two disjoint
+  // paths.
+  // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop.
+  bool propagateJoinDivergence(const MachineBasicBlock &JoinBlock,
+                               const MachineLoop *TermLoop);
+
+  /// \brief Propagate induced value divergence due to control divergence in \p
+  /// Term.
+  void propagateBranchDivergence(const MachineInstr &Term);
+
+  /// \brief Propagate induced value divergence due to exec update caused by \p
+  /// SaveExec.
+  void propagateExecControlFlowDivergence(const MachineInstr &SaveExec);
+
+  /// \brief Propagate divergence caused by a divergent loop exit.
+  ///
+  /// \param ExitingLoop is a divergent loop.
+  void propagateLoopDivergence(const MachineLoop &ExitingLoop);
+
+private:
+  const llvm::MachineFunction &F;
+  const llvm::MachineRegisterInfo &MRI;
+  const llvm::SIRegisterInfo *SIRI;
+  const llvm::SIInstrInfo *SIII;
+  // If RegionLoop != nullptr, analysis is only performed within \p RegionLoop.
+  // Otherwise, the whole function is analyzed.
+  const MachineLoop *RegionLoop;
+
+  const MachineDominatorTree &DT;
+  const MachinePostDominatorTree &PDT;
+  const MachineLoopInfo &LI;
+
+  // Recognized divergent loops
+  llvm::DenseSet<const MachineLoop *> DivergentLoops;
+
+  // AMDGPU change begin
+  // Save block pair which divergent disjoint.
+  // A
+  // | \
+  // |  \
+  // B   C
+  // |  /
+  //  D
+  // When A is divergent branch, B and C are divergent join at D.
+  // Then DivergentJoinMap[B].count(C) > 0 and
+  // DivergentJoinMap[C].count(B) > 0.
+  DivergentJoinMapTy &DivergentJoinMap;
+  // AMDGPU change end
+
+  // The SDA links divergent branches to divergent control-flow joins.
+  SyncDependenceAnalysis &SDA;
+
+  // Use simplified code path for LCSSA form.
+  bool IsLCSSAForm;
+
+  // Set of known-uniform values.
+  llvm::DenseSet<unsigned> UniformOverrides;
+  llvm::DenseSet<const llvm::MachineInstr*> UniformOverridesInsts;
+
+  // Blocks with joining divergent control from different predecessors.
+  llvm::DenseSet<const MachineBasicBlock *> DivergentJoinBlocks;
+
+  // Detected/marked divergent values.
+  llvm::DenseSet<unsigned> DivergentValues;
+  llvm::DenseSet<const llvm::MachineInstr*> DivergentInsts;
+
+  // Mir change for EXEC control flow.
+  // Map from MBB to the exec region it belongs to.
+  // An exec region begins with
+  //   S_MOV_B64 sreg, exec
+  // and ends with
+  //   S_MOV_B64 exec, sreg
+  // Inside the region, exec might be updated to implement control flow via the
+  // exec mask.
+  struct ExecRegion {
+    const llvm::MachineInstr *begin;
+    const llvm::MachineInstr *end;
+    std::vector<const llvm::MachineBasicBlock*> blocks;
+    bool bPropagated = false;
+    ExecRegion(const llvm::MachineInstr *b,
+               const llvm::MachineInstr *e)
+        : begin(b), end(e), bPropagated(false) {}
+  };
+  llvm::DenseMap<const llvm::MachineBasicBlock *, ExecRegion *> ExecRegionMap;
+
+  // Internal worklist for divergence propagation.
+  std::vector<const llvm::MachineInstr*> Worklist;
+};
+
+/// \brief Divergence analysis frontend for GPU kernels.
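+///
+/// Construct it with a MachineFunction and the machine dominator,
+/// post-dominator and loop analyses; divergence is computed in the
+/// constructor and can then be queried per instruction via isDivergent() /
+/// isUniform().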
+class MirGPUDivergenceAnalysis {
+  // AMDGPU change begin
+  // Save block pair which divergent disjoint.
+  // A
+  // | \
+  // |  \
+  // B   C
+  // |  /
+  //  D
+  // When A is divergent branch, B and C are divergent join at D.
+  // Then DivergentJoinMap[B].count(C) > 0 and
+  // DivergentJoinMap[C].count(B) > 0.
+  DivergentJoinMapTy  DivergentJoinMap;
+  // AMDGPU change end
+  SyncDependenceAnalysis SDA;
+  DivergenceAnalysis DA;
+
+public:
+  /// Runs the divergence analysis on \p F, a GPU kernel.
+  MirGPUDivergenceAnalysis(llvm::MachineFunction &F, const MachineDominatorTree &DT,
+                        const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI);
+
+  /// Whether any divergence was detected.
+  bool hasDivergence() const { return DA.hasDetectedDivergence(); }
+
+  /// The GPU kernel this analysis result is for
+  const llvm::MachineFunction &getFunction() const { return DA.getFunction(); }
+
+  /// Whether \p I is divergent.
+  bool isDivergent(const MachineInstr *I) const;
+
+  /// Whether \p I is uniform/non-divergent
+  bool isUniform(const MachineInstr *I) const { return !isDivergent(I); }
+
+  /// Print all divergent values in the kernel.
+  void print(llvm::raw_ostream &OS, const Module_ *) const;
+};
+
+} // namespace llvm
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
new file mode 100644
index 000000000000000..7213f7b4b11b4c6
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
@@ -0,0 +1,511 @@
+//===- AMDGPUMirSyncDependenceAnalysis.cpp - MIR Divergent Branch Dependence ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is based on Analysis/SyncDependenceAnalysis.cpp, with BasicBlock
+// changed to MachineBasicBlock.
+// This file implements an algorithm that returns for a divergent branch
+// the set of basic blocks whose phi nodes become divergent due to divergent
+// control. These are the blocks that are reachable by two disjoint paths from
+// the branch or loop exits that have a reaching path that is disjoint from a
+// path to the loop latch.
+//
+// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model
+// control-induced divergence in phi nodes.
+//
+// -- Summary --
+// The SyncDependenceAnalysis lazily computes sync dependences [3].
+// The analysis evaluates the disjoint path criterion [2] by a reduction
+// to SSA construction. The SSA construction algorithm is implemented as
+// a simple data-flow analysis [1].
+//
+// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy
+// [2] "Efficiently Computing Static Single Assignment Form
+//     and the Control Dependence Graph", TOPLAS '91,
+//           Cytron, Ferrante, Rosen, Wegman and Zadeck
+// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack
+// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira
+//
+// -- Sync dependence --
+// Sync dependence [4] characterizes the control flow aspect of the
+// propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// -- Reduction to SSA construction --
+// There are two disjoint paths from A to X, if a certain variant of SSA
+// construction places a phi node in X under the following set-up scheme [2].
+//
+// This variant of SSA construction ignores incoming undef values.
+// That is, paths from the entry without a definition do not result in
+// phi nodes.
+//
+//       entry
+//     /      \
+//    A        \
+//  /   \       Y
+// B     C     /
+//  \   /  \  /
+//    D     E
+//     \   /
+//       F
+// Assume that A contains a divergent branch. We are interested
+// in the set of all blocks where each block is reachable from A
+// via two disjoint paths. This would be the set {D, F} in this
+// case.
+// To generally reduce this query to SSA construction we introduce
+// a virtual variable x and assign to x different values in each
+// successor block of A.
+//           entry
+//         /      \
+//        A        \
+//      /   \       Y
+// x = 0   x = 1   /
+//      \  /   \  /
+//        D     E
+//         \   /
+//           F
+// Our flavor of SSA construction for x will construct the following
+//            entry
+//          /      \
+//         A        \
+//       /   \       Y
+// x0 = 0   x1 = 1  /
+//       \   /   \ /
+//      x2=phi    E
+//         \     /
+//          x3=phi
+// The blocks D and F contain phi nodes and are thus each reachable
+// by two disjoint paths from A.
+//
+// -- Remarks --
+// In case of loop exits we need to check the disjoint path criterion for loops
+// [2]. To this end, we check whether the definition of x differs between the
+// loop exit and the loop header (_after_ SSA construction).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "AMDGPUMirSyncDependenceAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+#include <stack>
+#include <unordered_set>
+
+#define DEBUG_TYPE "sync-dependence"
+
+namespace llvm {
+
+ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet;
+
+SyncDependenceAnalysis::SyncDependenceAnalysis(const MachineDominatorTree &DT,
+                                               const MachinePostDominatorTree &PDT,
+                                               const MachineLoopInfo &LI,
+                                               // AMDGPU change begin.
+                                               DivergentJoinMapTy &JoinMap
+                                               // AMDGPU change end.
+    )
+    : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI),
+    // AMDGPU change begin.
+      DivergentJoinMap(JoinMap)
+    // AMDGPU change end.
+{
+}
+
+SyncDependenceAnalysis::~SyncDependenceAnalysis() {}
+
+using FunctionRPOT = ReversePostOrderTraversal<const MachineFunction *>;
+
+// divergence propagator for reducible CFGs
+struct DivergencePropagator {
+  const FunctionRPOT &FuncRPOT;
+  const MachineDominatorTree &DT;
+  const MachinePostDominatorTree &PDT;
+  const MachineLoopInfo &LI;
+
+  // identified join points
+  std::unique_ptr<ConstBlockSet> JoinBlocks;
+
+  // reached loop exits (by a path disjoint to a path to the loop header)
+  SmallPtrSet<const MachineBasicBlock *, 4> ReachedLoopExits;
+
+  // if DefMap[B] == C then C is the dominating definition at block B
+  // if DefMap[B] ~ undef then we haven't seen B yet
+  // if DefMap[B] == B then B is a join point of disjoint paths from X or B is
+  // an immediate successor of X (initial value).
+  using DefiningBlockMap = std::map<const MachineBasicBlock *, const MachineBasicBlock *>;
+  DefiningBlockMap DefMap;
+
+  // all blocks with pending visits
+  std::unordered_set<const MachineBasicBlock *> PendingUpdates;
+
+  DivergencePropagator(const FunctionRPOT &FuncRPOT, const MachineDominatorTree &DT,
+                       const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI)
+      : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI),
+        JoinBlocks(new ConstBlockSet) {}
+
+  // set the definition at @block and mark @block as pending for a visit
+  void addPending(const MachineBasicBlock &Block, const MachineBasicBlock &DefBlock) {
+    bool WasAdded = DefMap.emplace(&Block, &DefBlock).second;
+    if (WasAdded)
+      PendingUpdates.insert(&Block);
+  }
+
+  void printDefs(raw_ostream &Out) {
+    Out << "Propagator::DefMap {\n";
+    for (const auto *Block : FuncRPOT) {
+      auto It = DefMap.find(Block);
+      Out << Block->getName() << " : ";
+      if (It == DefMap.end()) {
+        Out << "\n";
+      } else {
+        const auto *DefBlock = It->second;
+        Out << (DefBlock ? DefBlock->getName() : "<null>") << "\n";
+      }
+    }
+    Out << "}\n";
+  }
+
+  // process @succBlock with reaching definition @defBlock
+  // the original divergent branch was in @parentLoop (if any)
+  void visitSuccessor(const MachineBasicBlock &SuccBlock, const MachineLoop *ParentLoop,
+                      const MachineBasicBlock &DefBlock) {
+
+    // @succBlock is a loop exit
+    if (ParentLoop && !ParentLoop->contains(&SuccBlock)) {
+      DefMap.emplace(&SuccBlock, &DefBlock);
+      ReachedLoopExits.insert(&SuccBlock);
+      return;
+    }
+
+    // first reaching def?
+    auto ItLastDef = DefMap.find(&SuccBlock);
+    if (ItLastDef == DefMap.end()) {
+      addPending(SuccBlock, DefBlock);
+      return;
+    }
+
+    // a join of at least two definitions
+    if (ItLastDef->second != &DefBlock) {
+      // do we know this join already?
+      if (!JoinBlocks->insert(&SuccBlock).second)
+        return;
+
+      // update the definition
+      addPending(SuccBlock, SuccBlock);
+    }
+  }
+
+  // find all blocks reachable by two disjoint paths from @rootTerm.
+  // This method works for both divergent terminators and loops with
+  // divergent exits.
+  // @rootBlock is either the block containing the branch or the header of the
+  // divergent loop.
+  // @nodeSuccessors is the set of successors of the node (MachineLoop or Terminator)
+  // headed by @rootBlock.
+  // @parentLoop is the parent loop of the MachineLoop or the loop that contains the
+  // Terminator.
+  template <typename SuccessorIterable>
+  std::unique_ptr<ConstBlockSet>
+  computeJoinPoints(const MachineBasicBlock &RootBlock,
+                    SuccessorIterable NodeSuccessors, const MachineLoop *ParentLoop, const MachineBasicBlock * PdBoundBlock) {
+    assert(JoinBlocks);
+
+    // bootstrap with branch targets
+    for (const auto *SuccBlock : NodeSuccessors) {
+      DefMap.emplace(SuccBlock, SuccBlock);
+
+      if (ParentLoop && !ParentLoop->contains(SuccBlock)) {
+        // immediate loop exit from node.
+        ReachedLoopExits.insert(SuccBlock);
+        continue;
+      } else {
+        // regular successor
+        PendingUpdates.insert(SuccBlock);
+      }
+    }
+
+    auto ItBeginRPO = FuncRPOT.begin();
+
+    // skip until term (TODO RPOT won't let us start at @term directly)
+    for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {}
+
+    auto ItEndRPO = FuncRPOT.end();
+    assert(ItBeginRPO != ItEndRPO);
+
+    // propagate definitions at the immediate successors of the node in RPO
+    auto ItBlockRPO = ItBeginRPO;
+    while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) {
+      const auto *Block = *ItBlockRPO;
+
+      // skip @block if not pending update
+      auto ItPending = PendingUpdates.find(Block);
+      if (ItPending == PendingUpdates.end())
+        continue;
+      PendingUpdates.erase(ItPending);
+
+      // propagate definition at @block to its successors
+      auto ItDef = DefMap.find(Block);
+      const auto *DefBlock = ItDef->second;
+      assert(DefBlock);
+
+      auto *BlockLoop = LI.getLoopFor(Block);
+      if (ParentLoop &&
+          (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) {
+        // If the successor is the header of a nested loop, pretend it is a
+        // single node with the loop's exits as successors.
+        SmallVector<MachineBasicBlock *, 4> BlockLoopExits;
+        BlockLoop->getExitBlocks(BlockLoopExits);
+        for (const auto *BlockLoopExit : BlockLoopExits) {
+          visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock);
+        }
+
+      } else {
+        // the successors are either on the same loop level or loop exits
+        for (const auto *SuccBlock : Block->successors()) {
+          visitSuccessor(*SuccBlock, ParentLoop, *DefBlock);
+        }
+      }
+    }
+
+    // We need to know the definition at the parent loop header to decide
+    // whether the definition at the header is different from the definition at
+    // the loop exits, which would indicate divergent loop exits.
+    //
+    // A // loop header
+    // |
+    // B // nested loop header
+    // |
+    // C -> X (exit from B loop) -..-> (A latch)
+    // |
+    // D -> back to B (B latch)
+    // |
+    // proper exit from both loops
+    //
+    // D post-dominates B as it is the only proper exit from the "A loop".
+    // If C has a divergent branch, propagation will therefore stop at D.
+    // That implies that B will never receive a definition.
+    // But that definition can only be the same as at D (D itself in this case)
+    // because all paths to anywhere have to pass through D.
+    //
+    const MachineBasicBlock *ParentLoopHeader =
+        ParentLoop ? ParentLoop->getHeader() : nullptr;
+    if (ParentLoop && ParentLoop->contains(PdBoundBlock)) {
+      DefMap[ParentLoopHeader] = DefMap[PdBoundBlock];
+    }
+
+    // analyze reached loop exits
+    if (!ReachedLoopExits.empty()) {
+      assert(ParentLoop);
+      const auto *HeaderDefBlock = DefMap[ParentLoopHeader];
+      LLVM_DEBUG(printDefs(dbgs()));
+
+      // AMDGPU CHANGE: Allow null HeaderDefBlock
+      // Because of the way they walk the blocks (a reverse post order traversal
+      // stopping at the immediate post dominator) it is possible that
+      // they will reach a loop exit, but not the loop header.
+      //
+      // We conservatively mark the exit blocks as divergent join points
+      // in this case.
+      //
+      // Problem CFG is below:
+      //
+      //     +--> A
+      //     |   / \
+      //     |  B   C
+      //     |  | / |
+      //     +--L   P
+      //   
+      // In this cfg, C is the RootBlock and P is C's post-dominator.
+      // It will only visit L and P and then stop because it hits the
+      // post dominator. Most loops do not hit this case because the
+      // loop exiting block (C) will branch directly back to the loop
+      // header.
+      // 
+      if (HeaderDefBlock) {
+        for (const auto *ExitBlock : ReachedLoopExits) {
+          auto ItExitDef = DefMap.find(ExitBlock);
+          assert((ItExitDef != DefMap.end()) &&
+                 "no reaching def at reachable loop exit");
+          if (ItExitDef->second != HeaderDefBlock) {
+            JoinBlocks->insert(ExitBlock);
+          }
+        }
+      } else {
+        for (const auto *ExitBlock : ReachedLoopExits) {
+          JoinBlocks->insert(ExitBlock);
+        }
+      }
+    }
+
+    return std::move(JoinBlocks);
+  }
+};
+
+// AMDGPU change begin.
+// For all join blocks caused by divergent RootBlock, the prevs of a join block
+// which are in DefMap or the RootBlock are divergent join each other on the join block because
+// of divergent RootBlock.
+static void updateJoinMap(
+    const MachineBasicBlock *RootBlock,
+    DenseMap<const MachineBasicBlock *, SmallPtrSet<const MachineBasicBlock *, 4>> &JoinMap,
+    DivergencePropagator::DefiningBlockMap &DefMap, ConstBlockSet &JoinBlocks) {
+  for (const MachineBasicBlock *JoinBB : JoinBlocks) {
+    // Mark a divergent join for every pair of preds that are in DefMap.
+    for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end();
+         predIt++) {
+      auto predIt2 = predIt;
+      const MachineBasicBlock *pred = *predIt;
+      if (DefMap.count(pred) == 0 && pred != RootBlock)
+        continue;
+
+      for (predIt2++; predIt2 != JoinBB->pred_end(); predIt2++) {
+        const MachineBasicBlock *pred2 = *predIt2;
+        if (DefMap.count(pred2) == 0 && pred2 != RootBlock)
+          continue;
+
+        JoinMap[pred].insert(pred2);
+        JoinMap[pred2].insert(pred);
+        LLVM_DEBUG(dbgs() << "joint_bb0: " << pred->getName()
+                          << " joint_bb1: " << pred2->getName() << "\n";);
+      }
+    }
+  }
+}
+// AMDGPU change end.
+
+const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) {
+  using LoopExitVec = SmallVector<MachineBasicBlock *, 4>;
+  LoopExitVec LoopExits;
+  MachineLoop.getExitBlocks(LoopExits);
+  if (LoopExits.size() < 1) {
+    return EmptyBlockSet;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedLoopExitJoins.find(&MachineLoop);
+  if (ItCached != CachedLoopExitJoins.end()) {
+    return *ItCached->second;
+  }
+
+  // Don't propagate beyond the immediate post dominator of the loop.
+  const auto *PdNode = PDT.getNode(const_cast<MachineBasicBlock *>(MachineLoop.getHeader()));
+  const auto *IpdNode = PdNode->getIDom();
+  const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+  while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) {
+    IpdNode = IpdNode->getIDom();
+    PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+  }
+
+  // compute all join points
+  DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+  auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
+      *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), PdBoundBlock);
+
+  // AMDGPU change begin.
+  // Save divergent join pairs.
+  updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap,
+                    *JoinBlocks.get());
+  // AMDGPU change end.
+
+  auto ItInserted = CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+const ConstBlockSet &
+SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) {
+  // trivial case
+  if (Term.getParent()->succ_size() < 1) {
+    return EmptyBlockSet;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedBranchJoins.find(&Term);
+  if (ItCached != CachedBranchJoins.end())
+    return *ItCached->second;
+
+  // Don't propagate beyond the immediate post dominator of the branch.
+  const auto *PdNode = PDT.getNode(const_cast<MachineBasicBlock *>(Term.getParent()));
+  const auto *IpdNode = PdNode->getIDom();
+  const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+  
+
+  // compute all join points
+  DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+  const auto &TermBlock = *Term.getParent();
+  
+  // AMDGPU CHANGE
+  // Make sure the post-dominator is outside the loop for the loop header.
+  // Otherwise, we may not find all the join blocks in the loop 
+  // because the search stops too early. Some join points can be reached
+  // after the post-dominator!
+  //
+  // Problem CFG is below:
+  //
+  //     +--> A
+  //     |   / \
+  //     |  B   P
+  //     |  | / |
+  //     +--L   X
+  //
+  // In this cfg, A is the loop header and P is A's post-dominator.
+  // The algorithm to mark join points does a Reverse Post Order walk
+  // from A and stops when it reaches the post dominator. It would not 
+  // mark the phi node in L as divergent even when A had a divergent branch.
+  // The fix we made was to make the join point search continue all the way
+  // to the loops post dominator (which is X in this example).
+  //
+  // NOTE: They already made this change for the loop case above, but for
+  //       a different bug apparently. See SyncDependenceAnalysis::join_blocks(MachineLoop&)
+  //   
+  const MachineLoop *TermLoop = LI.getLoopFor(&TermBlock);
+  if (TermLoop && (TermLoop->getHeader() == &TermBlock)) {
+    while (PdBoundBlock && TermLoop->contains(PdBoundBlock)) {
+      IpdNode = IpdNode->getIDom();
+      PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+    }
+  }
+
+  auto JoinBlocks = Propagator.computeJoinPoints(
+      TermBlock, Term.getParent()->successors(), TermLoop, PdBoundBlock);
+
+  // AMDGPU change begin.
+  // Save divergent join pairs.
+  updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap,
+                    *JoinBlocks.get());
+  // AMDGPU change end.
+
+  auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
new file mode 100644
index 000000000000000..a52bcc7bc9e7c51
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
@@ -0,0 +1,98 @@
+//===- AMDGPUMirSyncDependenceAnalysis.h - MIR Divergent Branch Dependence -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file defines the SyncDependenceAnalysis class, which computes for
+// every divergent branch the set of phi nodes that the branch will make
+// divergent.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <memory>
+#include <map>
+
+namespace llvm {
+class MachineBasicBlock;
+class MachineDominatorTree;
+class MachineLoop;
+class MachinePostDominatorTree;
+class MachineLoopInfo;
+class MachineFunction;
+class MachineInstr;
+
+using DivergentJoinMapTy =
+    llvm::DenseMap<const llvm::MachineBasicBlock *,
+                   llvm::SmallPtrSet<const llvm::MachineBasicBlock *, 4>>;
+
+using ConstBlockSet = llvm::SmallPtrSet<const MachineBasicBlock *, 4>;
+
+/// \brief Relates points of divergent control to join points in
+/// reducible CFGs.
+///
+/// This analysis relates points of divergent control to points of converging
+/// divergent control. The analysis requires all loops to be reducible.
+class SyncDependenceAnalysis {
+  void visitSuccessor(const MachineBasicBlock &succBlock, const MachineLoop *termLoop,
+                      const MachineBasicBlock *defBlock);
+
+public:
+  bool inRegion(const MachineBasicBlock &BB) const;
+
+  ~SyncDependenceAnalysis();
+  SyncDependenceAnalysis(const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+                         const MachineLoopInfo &LI,
+                         // AMDGPU change begin
+                         DivergentJoinMapTy &JoinMap
+                         // AMDGPU change end
+  );
+
+  /// \brief Computes divergent join points and loop exits caused by branch
+  /// divergence in \p Term.
+  ///
+  /// The set of blocks which are reachable by disjoint paths from \p Term.
+  /// The set also contains loop exits if there are two disjoint paths:
+  /// one from \p Term to the loop exit and another from \p Term to the loop
+  /// header. Those exit blocks are added to the returned set.
+  /// If L is the parent loop of \p Term and an exit of L is in the returned
+  /// set then L is a divergent loop.
+  const ConstBlockSet &join_blocks(const MachineInstr &Term);
+
+  /// \brief Computes divergent join points and loop exits (in the surrounding
+  /// loop) caused by the divergent loop exits of \p MachineLoop.
+  ///
+  /// The set of blocks which are reachable by disjoint paths from the
+  /// loop exits of \p MachineLoop.
+  /// This treats the loop as a single node in \p MachineLoop's parent loop.
+  /// The returned set has the same properties as for join_blocks(TermInst&).
+  const ConstBlockSet &join_blocks(const MachineLoop &MachineLoop);
+
+private:
+  static ConstBlockSet EmptyBlockSet;
+
+  llvm::ReversePostOrderTraversal<const llvm::MachineFunction *> FuncRPOT;
+  const MachineDominatorTree &DT;
+  const MachinePostDominatorTree &PDT;
+  const MachineLoopInfo &LI;
+  // AMDGPU change begin.
+  DivergentJoinMapTy &DivergentJoinMap;
+  // AMDGPU change end.
+  std::map<const MachineLoop *, std::unique_ptr<ConstBlockSet>> CachedLoopExitJoins;
+  std::map<const MachineInstr *, std::unique_ptr<ConstBlockSet>>
+      CachedBranchJoins;
+};
+
+} // namespace llvm
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
new file mode 100644
index 000000000000000..648df7f724617f4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -0,0 +1,188 @@
+//===-- AMDGPUOccupancyAndLatencyHelper.cpp - Occupancy/Latency Helpers --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--------------------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//===--------------------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+namespace llvm {
+
+// Compute a score used to compare schedule results.
+float SchedScore::computeScore() const {
+  // At occupancy 1, VALU and SALU work cannot be interleaved.
+  unsigned MixHiddenAlu = Alu - MixAlu;
+  if (Occupancy == 1)
+    MixHiddenAlu = 0;
+  return ((float)MemLatency - (float)MixHiddenAlu) / (float)Occupancy -
+         LatencyHide;
+}
+float SchedScore::computeScore2() const {
+  float cycles = 0;
+  cycles = (MixAlu * Occupancy + MemLatency);
+  cycles /= Occupancy;
+  return cycles;
+}
+
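+// Accumulate s into this score, weighting contributions from loops by an
+// assumed trip count of 3 per loop-nesting level.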
+void SchedScore::sum(const SchedScore &s, unsigned loopDepth) {
+  unsigned loopCount = loopDepth > 0 ? std::pow(3, loopDepth) : 1;
+  LatencyHide += loopCount * s.LatencyHide;
+  MemLatency += loopCount * s.MemLatency;
+  MixAlu += loopCount * s.MixAlu;
+  Alu += loopCount * s.Alu;
+  Lds += loopCount * s.Lds;
+  SgprSpill |= s.SgprSpill;
+}
+bool SchedScore::isBetter(const SchedScore &s) const {
+  float score = computeScore();
+  float newScore = s.computeScore();
+  bool spillBetter = !SgprSpill && s.SgprSpill;
+  return spillBetter ? true : newScore >= score;
+}
+// Does more occupancy give more perf.
+bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
+  unsigned gain = latencyGain(TargetOccupancy, ExtraOcc);
+  // 10% is good enough.
+  if ((10*gain) >= Alu)
+    return true;
+  else
+    return false;
+}
+
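+// Memory latency cycles saved per wave by raising occupancy from TgtOcc to
+// TgtOcc + ExtraOcc.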
+unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const {
+  unsigned latency = MemLatency;
+  return (latency / (TgtOcc))- (latency / (TgtOcc + ExtraOcc));
+}
+
+// AMDGPULatencyTracker
+AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST)
+    : SIII(ST.getInstrInfo()), ItineraryData(ST.getInstrItineraryData()) {}
+
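+// Accumulate the latency of MI into the score. Pending high/low-latency loads
+// are tracked per destination register; when an instruction reads one of
+// those registers, the remaining cycles are charged as exposed memory latency,
+// while cycles that elapse underneath independent work are counted as hidden.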
+void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
+  if (MI.isDebugInstr()) return;
+  int latency = SIII->getInstrLatency(ItineraryData, MI);
+  // If inside latency hide.
+  if (!LatencyMIs.empty()) {
+    bool bWaitCnt = false;
+    for (auto &MO : MI.operands()) {
+      if (MO.isReg()) {
+        unsigned reg = MO.getReg();
+        auto it = LatencyMIs.find(reg);
+        if (it != LatencyMIs.end()) {
+          bWaitCnt = true;
+          // If MI uses a memory result, raise its latency to the memory latency.
+          int cycle = it->second;
+          if (cycle > latency)
+            latency = cycle;
+        }
+      }
+    }
+    // Update the remaining cycles of each pending latency instruction.
+    for (auto it = LatencyMIs.begin(); it != LatencyMIs.end();) {
+      auto l = it++;
+      int cycle = l->second;
+      if (cycle <= latency) {
+        // The latency is fully consumed; remove the register.
+        LatencyMIs.erase(l);
+        if (bWaitCnt && cycle == latency) {
+          score.MemLatency += cycle;
+          // Only count MemLatency once; the rest is hidden.
+          bWaitCnt = false;
+        } else {
+          // Hide cycle or count mem latency?
+          score.LatencyHide += cycle;
+        }
+      } else {
+        l->second -= latency;
+        // Hide latency.
+        score.LatencyHide += latency;
+      }
+    }
+
+  } else {
+    // TODO: check branch/lds?
+    // TODO: check prevVAlu?
+    auto getAluStatus = [](const MachineInstr &MI,
+                           const llvm::SIInstrInfo *SIII) {
+      AluStatus status = AluStatus::Nothing;
+      if (SIII->isVALU(MI.getOpcode())) {
+        status = AluStatus::Vector;
+      } else if (SIII->isSALU(MI.getOpcode())) {
+        status = AluStatus::Scalar;
+      }
+      return status;
+    };
+    AluStatus status = getAluStatus(MI, SIII);
+
+    switch (prevStatus) {
+    case AluStatus::Nothing: {
+      score.Alu += latency;
+      score.MixAlu += latency;
+      prevStatus = status;
+    } break;
+    case AluStatus::Vector:
+    case AluStatus::Scalar: {
+      score.Alu += latency;
+      // Ignore mix alu.
+      if (prevStatus != status) {
+        prevStatus = AluStatus::Nothing;
+      } else {
+        score.MixAlu += latency;
+      }
+    } break;
+    }
+  }
+  // Update latency inst.
+  if (SIII->isHighLatencyInstruction(MI) && MI.mayLoad()) {
+    unsigned reg = MI.getOperand(0).getReg();
+    // TODO: get correct latency.
+    // SIII->getInstrLatency(ItinerayData, MI);
+    constexpr unsigned kHighLatency = 180;
+    LatencyMIs[reg] = kHighLatency;
+  } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) {
+    unsigned reg = MI.getOperand(0).getReg();
+    // TODO: get correct latency.
+    // SIII->getInstrLatency(ItinerayData, MI);
+    constexpr unsigned kLowLatency = 35;
+    LatencyMIs[reg] = kLowLatency;
+  }
+}
+
+SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
+                          const llvm::MachineLoopInfo *MLI) {
+  SchedScore totalScore;
+  for (MachineBasicBlock &MBB : MF) {
+    AMDGPULatencyTracker latencyTracker(ST);
+    for (auto &MI : MBB) {
+      latencyTracker.scan(MI);
+    }
+    unsigned loopDepth = 0;
+    if (MLI) {
+      loopDepth = MLI->getLoopDepth(&MBB);
+    }
+    totalScore.sum(latencyTracker.score, loopDepth);
+  }
+  return totalScore;
+}
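+// Usage sketch (illustrative; TgtOcc and TryRemat are hypothetical):
+//   SchedScore Score = CollectLatency(MF, ST, MLI);
+//   if (Score.isMemBound(TgtOcc, /*ExtraOcc=*/1)) {
+//     // The function is memory bound, so extra occupancy is worth chasing,
+//     // e.g. by rematerializing defs to reduce register pressure.
+//     TryRemat(MF);
+//   }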
+
+} // namespace llvm
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
new file mode 100644
index 000000000000000..f108bab24bd3907
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -0,0 +1,74 @@
+//===- AMDGPUOccupancyAndLatencyHelper.h - Occupancy and latency helpers -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class MachineFunction;
+class GCNSubtarget;
+class InstrItineraryData;
+class MachineInstr;
+class SIInstrInfo;
+class MachineLoopInfo;
+
+struct SchedScore {
+  // Score for this Sched result.
+  unsigned Occupancy = 0;
+  bool SgprSpill = false;
+  unsigned LatencyHide = 0; // Only latency hiding splits 2 loads into 2 passes?
+  unsigned MemLatency = 0;  // Only saves mem latency.
+                            // We want mem latency small and hide big. Compare
+                            // memLatency - hide * Occ; smaller is better.
+  unsigned MixAlu = 0; // VALU and SALU can run in parallel if Occ > 1.
+  unsigned Alu = 0; // Avoid sequences of s_alu insts with count less than
+                    // occupancy.
+  unsigned Lds = 0; // TODO: count LDS.
+  SchedScore() {}
+
+  // Other info which can help compare schedule result.
+  float computeScore() const;
+  float computeScore2() const;
+
+  void sum(const SchedScore &s, unsigned loopDepth=0);
+  bool isBetter(const SchedScore &s) const;
+  bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc=1) const;
+  // More latency can be hidden with ExtraOcc.
+  unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
+};
+
+struct AMDGPULatencyTracker {
+  AMDGPULatencyTracker(const llvm::GCNSubtarget &ST);
+  const llvm::SIInstrInfo *SIII;
+  const llvm::InstrItineraryData *ItineraryData;
+  // Latency MI dst reg to cycle map.
+  llvm::DenseMap<unsigned, int> LatencyMIs;
+  SchedScore score;
+  // Low latency MIs that are not waited on.
+  unsigned hideLatency = 0;
+  unsigned memLatency = 0;
+  // For simplicity, only consider the mixture as one VALU and one SALU.
+  // No grouping for now.
+  unsigned prevSAlu = 0;
+  unsigned prevVAlu = 0;
+  enum class AluStatus {
+    Nothing,
+    Vector,
+    Scalar,
+  } prevStatus = AluStatus::Nothing;
+  void scan(const llvm::MachineInstr &MI);
+};
+
+SchedScore CollectLatency(llvm::MachineFunction &MF,
+                          const llvm::GCNSubtarget &ST,
+                          const llvm::MachineLoopInfo *MLI = nullptr);
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
new file mode 100644
index 000000000000000..a0f2a5d4dc121b5
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
@@ -0,0 +1,1790 @@
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Support/GraphWriter.h"
+
+#include "llvm/Support/Debug.h"
+
+#include "GCNRegPressure.h"
+#include "AMDGPUMIRUtils.h"
+#include "AMDGPUSubExpDag.h"
+#include <unordered_set>
+
+#define DEBUG_TYPE "xb-sub-exp-dag"
+using namespace llvm;
+
+namespace llvm {
+
+// Expression Dag.
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void SubExp::dump(const MachineRegisterInfo &MRI,
+                  const SIRegisterInfo *SIRI) const {
+  dbgs() << "\nSubExp:\n";
+  dbgs() << "input regs:\n";
+  for (auto &input : inputLive) {
+    pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs());
+    dbgs() << "\n";
+  }
+  dbgs() << "output regs:\n";
+  for (auto &output : outputLive) {
+    pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs());
+    dbgs() << "\n";
+  }
+
+  for (MachineInstr *MI : SUnits) {
+    MI->dump();
+  }
+  dbgs() << "End of SubExp\n";
+}
+#endif
+
+bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const {
+  for (const MachineInstr *MI : SUnits) {
+    if (MI->modifiesRegister(Reg, SIRI))
+      return true;
+  }
+  return false;
+}
+
+void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI,
+                             const SIRegisterInfo *SIRI) {
+  sMaxSize = std::max(sInputSize, sOutputSize);
+  vMaxSize = std::max(vInputSize, vOutputSize);
+
+  DenseMap<unsigned, LaneBitmask> LiveRegs;
+  GCNRegPressure CurPressure;
+
+  // Add output to pressure.
+  for (MachineInstr *MI : BottomRoots) {
+    for (MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg())
+        continue;
+      if (!MO.isDef())
+        continue;
+      Register Reg = MO.getReg();
+      if (!Reg.isVirtual())
+        continue;
+      LaneBitmask mask = getRegMask(MO, MRI);
+      auto it = LiveRegs.find(Reg);
+      if (it != LiveRegs.end()) {
+        LiveRegs[Reg] = mask | it->second;
+      } else {
+        LiveRegs[Reg] = mask;
+      }
+    }
+  }
+
+  for (auto it : LiveRegs) {
+    LaneBitmask emptyMask;
+    CurPressure.inc(it.first, emptyMask, it.second, MRI);
+  }
+
+  for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) {
+    MachineInstr *MI = *it;
+    auto *ST = &MI->getMF()->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+    for (MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg())
+        continue;
+      Register Reg = MO.getReg();
+      if (!Reg.isVirtual()) {
+        if (Reg == AMDGPU::SCC)
+          bTouchSCC = true;
+        continue;
+      }
+
+      LaneBitmask LiveMask = getRegMask(MO, MRI);
+      LaneBitmask PrevMask;
+      auto liveIt = LiveRegs.find(Reg);
+      if (liveIt != LiveRegs.end()) {
+        PrevMask = liveIt->second;
+      }
+
+      if (MO.isDef()) {
+        LiveMask = PrevMask & (~(LiveMask));
+      } else {
+        LiveMask = PrevMask | LiveMask;
+      }
+
+      CurPressure.inc(Reg, PrevMask, LiveMask, MRI);
+      LiveRegs[Reg] = LiveMask;
+    }
+
+    unsigned sSize = CurPressure.getSGPRNum();
+    unsigned vSize = CurPressure.getVGPRNum(ST->hasGFX90AInsts());
+    if (sSize > sMaxSize)
+      sMaxSize = sSize;
+    if (vSize > vMaxSize)
+      vMaxSize = vSize;
+  }
+}
+
+bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const {
+  if (bMultiDefOutput)
+    return false;
+  if (bHasTerminatorInst)
+    return false;
+  if (bUseIncomingReg)
+    return false;
+
+  // Input should be single def.
+  for (unsigned Reg : TopRegs) {
+    if (!MRI.hasOneDef(Reg) && !llvm::IsSub0Sub1SingleDef(Reg, MRI))
+      return false;
+  }
+  return true;
+}
+
+ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI,
+               const llvm::SIRegisterInfo *SIRI,
+               const SIInstrInfo *SIII, const bool bJoinInput)
+    : MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {}
+
+template <typename T>
+void ExpDag::initNodes(const LiveSet &InputLiveReg, T &insts) {
+  unsigned NodeSize = InputLiveReg.size() + insts.size();
+  SUnits.reserve(NodeSize);
+
+  for (MachineInstr *MI : insts) {
+    if (MI->isDebugInstr())
+      continue;
+    SUnits.emplace_back(MI, SUnits.size());
+    SUnit *SU = &SUnits.back();
+    SUnitMIMap[SU] = MI;
+    MISUnitMap[MI] = SU;
+  }
+
+  for (auto it : InputLiveReg) {
+    unsigned Reg = it.first;
+    SUnits.emplace_back();
+    SUnit *SU = &SUnits.back();
+    SU->NodeNum = SUnits.size() - 1;
+    SUnitInputMap[SU] = Reg;
+    InputSUnitMap[Reg] = SU;
+  }
+}
+
+template void ExpDag::initNodes<DenseSet<MachineInstr *>>(
+    const LiveSet &InputLiveReg, DenseSet<MachineInstr *> &instRange);
+
+template void ExpDag::initNodes<std::vector<MachineInstr *>>(
+    const LiveSet &InputLiveReg, std::vector<MachineInstr *> &instRange);
+
+template <typename T>
+void ExpDag::build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg,
+                   T &insts) {
+  initNodes(InputLiveReg, insts);
+  addDataDep(SIRI);
+  addCtrlDep();
+  buildSubExp(InputLiveReg, OutputLiveReg, SIRI, SIII);
+}
+
+template void
+ExpDag::build<DenseSet<MachineInstr *>>(const LiveSet &InputLiveReg,
+                                        const LiveSet &OutputLiveReg,
+                                        DenseSet<MachineInstr *> &instRange);
+template void ExpDag::build<std::vector<MachineInstr *>>(const LiveSet &InputLiveReg,
+                                               const LiveSet &OutputLiveReg,
+                                               std::vector<MachineInstr *> &instRange);
+
+void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
+                         const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+  IntEqClasses SubtreeClasses(SUnits.size());
+  std::vector<unsigned> passThruInputs;
+  for (SUnit &SU : SUnits) {
+    if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) {
+      passThruInputs.emplace_back(SU.NodeNum);
+      continue;
+    }
+    if (!bJoinInputToSubExp && !SU.isInstr())
+      continue;
+    // Join prev.
+    for (SDep &PreDep : SU.Preds) {
+      SUnit *PreSU = PreDep.getSUnit();
+      if (!bJoinInputToSubExp && !PreSU->isInstr())
+        continue;
+      SubtreeClasses.join(SU.NodeNum, PreSU->NodeNum);
+    }
+    // Join succ.
+    for (SDep &SucDep : SU.Succs) {
+      SUnit *SucSU = SucDep.getSUnit();
+      SubtreeClasses.join(SU.NodeNum, SucSU->NodeNum);
+    }
+  }
+  SubtreeClasses.compress();
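+  // E.g. with data edges SU0->SU2 and SU1->SU2 only, the joins above put SU0,
+  // SU1 and SU2 into one equivalence class, i.e. one SubExp; an input node
+  // with no edges stays in its own singleton class (a passThruInput).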
+
+  unsigned NumSubExps = SubtreeClasses.getNumClasses();
+  // Don't count passThruInputs as subExps since each is an exp with only 1 SU.
+  // SubExpIndexMap is used to pack SubIdx within updated NumSubExps.
+  NumSubExps -= passThruInputs.size();
+  SubExps.resize(NumSubExps);
+  DenseMap<unsigned, unsigned> SubExpIndexMap;
+
+  // Add SU to sub exp.
+  for (SUnit &SU : SUnits) {
+    if (SU.NumPredsLeft == 0 && SU.NumSuccsLeft == 0) {
+      continue;
+    }
+    unsigned SubIdx = SubtreeClasses[SU.NodeNum];
+    unsigned OriginSubIdx = SubIdx;
+    // Pack subidx.
+    if (SubExpIndexMap.count(SubIdx) == 0) {
+      unsigned count = SubExpIndexMap.size();
+      SubExpIndexMap.insert(std::make_pair(SubIdx, count));
+    }
+    SubIdx = SubExpIndexMap[SubIdx];
+    // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag.
+    SU.NodeQueueId = SubIdx;
+
+    SubExp &Exp = SubExps[SubIdx];
+    auto it = SUnitInputMap.find(&SU);
+    if (it != SUnitInputMap.end()) {
+      // Input.
+      unsigned Reg = it->second;
+      Exp.TopRegs.insert(Reg);
+    } else {
+      MachineInstr *MI = SU.getInstr();
+      MachineBasicBlock *MBB = MI->getParent();
+      Exp.FromBB = MBB;
+      for (MachineOperand &MO : MI->operands()) {
+        if (!MO.isReg())
+          continue;
+        if (!MO.isUse())
+          continue;
+        unsigned Reg = MO.getReg();
+        if (MRI.getLiveInPhysReg(Reg) || MRI.getLiveInVirtReg(Reg)) {
+          Exp.bUseIncomingReg = true;
+        }
+      }
+
+      Exp.SUnits.emplace_back(MI);
+      if (SU.NumSuccsLeft == 0) {
+        Exp.BottomRoots.insert(MI);
+        if (MI->isTerminator())
+          Exp.bHasTerminatorInst = true;
+      }
+      if (MI->isNotDuplicable())
+        Exp.bNotSafeToCopy = true;
+      // Skip Scalar mem access since no scalar store.
+      if (MI->mayLoadOrStore() && !SIII->isSMRD(*MI)) {
+        Exp.bHasMemInst = true;
+      }
+      // Add bottom regs.
+      for (MachineOperand &MO : MI->operands()) {
+        if (!MO.isReg())
+          continue;
+        if (!MO.isDef())
+          continue;
+        Register Reg = MO.getReg();
+        // physical reg is not in live reg.
+        if (!Reg.isVirtual())
+          continue;
+        if (SU.NumSuccsLeft) {
+          // For an SU whose def is used in the current block, check whether it
+          // is also used in other blocks or other subExps.
+          bool bUsedInOtherBlk = false;
+          for (auto &UserMI : MRI.use_nodbg_instructions(Reg)) {
+            if (UserMI.getParent() != MBB) {
+              bUsedInOtherBlk = true;
+              break;
+            }
+            auto suIt = MISUnitMap.find(&UserMI);
+            // When UserMI is not in dag, treat it as other block.
+            if (suIt == MISUnitMap.end()) {
+              bUsedInOtherBlk = true;
+              break;
+            }
+            SUnit *UseSU = suIt->second;
+            // UserMI should always be in same subExp.
+            unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum];
+            if (UseSubIdx != OriginSubIdx) {
+              // When the reg has multiple defs, the user and the def may be
+              // in different subExps.
+              if (MRI.getUniqueVRegDef(Reg))
+                llvm::report_fatal_error("user and def in different subExp");
+              break;
+            }
+          }
+          if (!bUsedInOtherBlk)
+            continue;
+        }
+        Exp.BottomRegs.insert(Reg);
+        if (!MRI.getUniqueVRegDef(Reg)) {
+          Exp.bMultiDefOutput = true;
+        }
+      }
+    }
+  }
+  // Calc reg for SubExp.
+  // Get block live in and live out.
+  // Only reg will miss live mask.
+  for (SubExp &Exp : SubExps) {
+    for (unsigned Reg : Exp.TopRegs) {
+      auto it = StartLiveReg.find(Reg);
+      assert(it != StartLiveReg.end() &&
+             "cannot find input reg in block start live");
+      Exp.inputLive[Reg] |= it->second;
+    }
+
+    for (unsigned Reg : Exp.BottomRegs) {
+      auto it = EndLiveReg.find(Reg);
+      if (it == EndLiveReg.end()) {
+        // The bottom reg is killed inside the current block and does not live
+        // out of it, or it is not treated as an output of this dag. Either
+        // way, don't save it to outputLive, which would affect the profit
+        // count.
+        continue;
+      }
+      Exp.outputLive[Reg] |= it->second;
+    }
+
+    CollectLiveSetPressure(Exp.inputLive, MRI, SIRI, Exp.vInputSize,
+                           Exp.sInputSize);
+    CollectLiveSetPressure(Exp.outputLive, MRI, SIRI, Exp.vOutputSize,
+                           Exp.sOutputSize);
+  }
+}
+
+void ExpDag::addDataDep(const SIRegisterInfo *SIRI) {
+  DenseMap<unsigned, MachineInstr *> curDefMI;
+
+  for (SUnit &SU : SUnits) {
+    if (!SU.isInstr())
+      continue;
+    MachineInstr *MI = SU.getInstr();
+
+    // Link use to the def.
+    for (MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef())
+        continue;
+
+      Register Reg = MO.getReg();
+      SUnit *DefSU = nullptr;
+
+      auto curDefIt = curDefMI.find(Reg);
+      // Check def inst first.
+      if (curDefIt != curDefMI.end()) {
+        MachineInstr *curDef = curDefIt->second;
+        DefSU = MISUnitMap[curDef];
+      } else {
+        // physical reg is not in live reg.
+        if (!Reg.isVirtual())
+          continue;
+        if (MO.isUndef())
+          continue;
+        // Is it OK for a debug instr MO to have no def?
+        if (MI->isDebugInstr())
+          continue;
+        // Should be an input.
+        assert(InputSUnitMap.count(Reg) > 0 && "cannot find def");
+        DefSU = InputSUnitMap[Reg];
+      }
+      SU.addPred(SDep(DefSU, SDep::Data, Reg));
+    }
+
+    // Add def to curDefMI;
+    for (MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg())
+        continue;
+      if (!MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+
+      // For case like:
+      // undef %808.sub0:sgpr_64 = COPY killed %795:sgpr_32
+      // %808.sub1:sgpr_64 = S_MOV_B32 0
+      // When partially write, link MI to previous def.
+      if (MO.getSubReg() != 0) {
+        SUnit *DefSU = nullptr;
+        auto curDefIt = curDefMI.find(Reg);
+        // Check def inst first.
+        if (curDefIt != curDefMI.end()) {
+          MachineInstr *curDef = curDefIt->second;
+          DefSU = MISUnitMap[curDef];
+          // Add link between different defs.
+          SU.addPred(SDep(DefSU, SDep::Data, Reg));
+        }
+      }
+
+      curDefMI[Reg] = MI;
+    }
+  }
+}
+
+void ExpDag::addCtrlDep() {
+  // TODO: add depend for memory, barrier.
+}
+
+BlockExpDag::BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS,
+                         const llvm::MachineRegisterInfo &MRI,
+                         const llvm::SIRegisterInfo *SIRI,
+                         const llvm::SIInstrInfo *SIII)
+    : ExpDag(MRI, SIRI, SIII, /*bJoinInput*/ true), LIS(LIS), MBB(B) {}
+
+void BlockExpDag::build() {
+  auto *SlotIndexes = LIS->getSlotIndexes();
+  const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB);
+  const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI);
+
+  const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB);
+  const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI);
+
+  std::vector<MachineInstr *> insts;
+  for (MachineInstr &MI : *MBB) {
+    insts.emplace_back(&MI);
+  }
+
+  ExpDag::build(StartLiveReg, EndLiveReg, insts);
+}
+
+void BlockExpDag::buildWithPressure() {
+  auto *SlotIndexes = LIS->getSlotIndexes();
+  const auto StartIdx = SlotIndexes->getMBBStartIdx(MBB);
+  const auto StartLiveReg = llvm::getLiveRegs(StartIdx, *LIS, MRI);
+
+  const auto EndIdx = SlotIndexes->getMBBEndIdx(MBB);
+  const auto EndLiveReg = llvm::getLiveRegs(EndIdx, *LIS, MRI);
+
+  std::vector<MachineInstr *> insts;
+  for (MachineInstr &MI : *MBB) {
+    insts.emplace_back(&MI);
+  }
+
+  ExpDag::build(StartLiveReg, EndLiveReg, insts);
+  // Build pressure.
+  buildPressure(StartLiveReg, EndLiveReg);
+}
+
+void BlockExpDag::buildAvail(
+    const LiveSet &passThruSet,
+    DenseMap<SUnit *, LiveSet> &DagAvailRegMap) {
+  DenseSet<SUnit *> Processed;
+
+  DenseSet<SUnit *> WorkList;
+  MachineInstr &BeginMI = MBB->instr_front();
+
+  // Calc available regs for each node; live is avail & sum(input of successors).
+  // If a reg is available from a node, then a successor node can use it from
+  // this node. For dag live, a pred's output doesn't need to cover all the
+  // inputs a node needs; as long as all pred outputs together cover the
+  // inputs, it is OK.
+  for (SUnit &SU : SUnits) {
+    if (SU.NumPredsLeft == 0) {
+      GCNDownwardRPTracker RP(*LIS);
+      RP.reset(BeginMI, &passThruSet);
+      MachineInstr *MI = SU.getInstr();
+      if (MI) {
+        RP.reset(*MI, &passThruSet);
+        RP.advance();
+      }
+      DagAvailRegMap[&SU] = RP.getLiveRegs();
+
+      // Add succ to work list.
+      for (auto &Succ : SU.Succs) {
+        SUnit *SuccSU = Succ.getSUnit();
+        if (SuccSU->NumPredsLeft > 0)
+          SuccSU->NumPredsLeft--;
+        WorkList.insert(SuccSU);
+      }
+    }
+  }
+  while (!WorkList.empty()) {
+    bool bUpdated = false;
+    SmallVector<SUnit *, 4> ReadyNodes;
+    for (SUnit *SU : WorkList) {
+      if (SU->NumPredsLeft > 0)
+        continue;
+      ReadyNodes.emplace_back(SU);
+      // Ready, move it to Processed.
+      Processed.insert(SU);
+      bUpdated = true;
+      // Only update one node at a time.
+      // The order of scheduling here should not affect pressure.
+      break;
+    }
+
+    for (SUnit *SU : ReadyNodes) {
+      // Remove SU from worklist.
+      WorkList.erase(SU);
+
+      MachineInstr *MI = SU->getInstr();
+      // Calc pressure based on pred nodes.
+      GCNRPTracker::LiveRegSet dagLive;
+      for (auto &Pred : SU->Preds) {
+        SUnit *PredSU = Pred.getSUnit();
+        GCNRPTracker::LiveRegSet PredLive = DagAvailRegMap[PredSU];
+
+        GCNDownwardRPTracker RP(*LIS);
+        RP.reset(BeginMI, &PredLive);
+        if (MI) {
+          RP.reset(*MI, &PredLive);
+          // Update PredLive based on MI.
+          RP.advance();
+        }
+        llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs());
+      }
+      DagAvailRegMap[SU] = dagLive;
+
+      // Add succ to work list.
+      for (auto &Succ : SU->Succs) {
+        SUnit *SuccSU = Succ.getSUnit();
+        if (SuccSU->NumPredsLeft > 0)
+          SuccSU->NumPredsLeft--;
+        WorkList.insert(SuccSU);
+      }
+    }
+
+    // Avoid an infinite loop.
+    if (ReadyNodes.empty()) {
+      errs() << "dead loop when building dag pressure\n";
+      break;
+    }
+  }
+
+  assert(WorkList.empty() && "schedule failed for available reg");
+}
+
+void BlockExpDag::buildPressure(const LiveSet &StartLiveReg,
+                                const LiveSet &EndLiveReg) {
+  if (MBB->empty())
+    return;
+  DenseMap<SUnit *, GCNRPTracker::LiveRegSet> DagAvailRegMap;
+  GCNRPTracker::LiveRegSet passThruSet;
+  for (auto Reg : StartLiveReg) {
+    unsigned reg = Reg.first;
+    auto EndReg = EndLiveReg.find(reg);
+    if (EndReg == EndLiveReg.end())
+      continue;
+
+    LaneBitmask mask = Reg.second;
+    LaneBitmask endMask = EndReg->second;
+    mask &= endMask;
+    if (mask.getAsInteger() == 0)
+      continue;
+    passThruSet[reg] = mask;
+  }
+
+  // Build avail for each node.
+  buildAvail(passThruSet, DagAvailRegMap);
+
+  // Calc available regs for each node; live is avail & sum(input of successors).
+  // If a reg is available from a node, then a successor node can use it from
+  // this node. For dag live, a pred's output doesn't need to cover all the
+  // inputs a node needs; as long as all pred outputs together cover the
+  // inputs, it is OK.
+  DenseSet<SUnit *> Processed;
+
+  DenseSet<SUnit *> WorkList;
+  MachineInstr &BeginMI = MBB->instr_front();
+
+  for (SUnit &SU : SUnits) {
+    if (SU.NumSuccsLeft == 0) {
+      // Calc pressure based on pass thru.
+      // Using pass thru as base because output of current SU should not
+      // affect other output SUs.
+      GCNUpwardRPTracker RP(*LIS);
+      RP.reset(BeginMI, &passThruSet, /*After*/true);
+      MachineInstr *MI = SU.getInstr();
+      if (MI) {
+        RP.reset(*MI, &passThruSet, /*After*/true);
+        RP.recede(*MI);
+      }
+      DagPressureMap[&SU] = RP.getLiveRegs();
+      // Add pred to work list.
+      for (auto &Pred : SU.Preds) {
+        SUnit *PredSU = Pred.getSUnit();
+        PredSU->NumSuccsLeft--;
+        WorkList.insert(PredSU);
+      }
+    }
+  }
+
+  while (!WorkList.empty()) {
+    bool bUpdated = false;
+    SmallVector<SUnit *, 4> ReadyNodes;
+    for (SUnit *SU : WorkList) {
+      if (SU->NumSuccsLeft > 0)
+        continue;
+      ReadyNodes.emplace_back(SU);
+      // Ready, move it to Processed.
+      Processed.insert(SU);
+      bUpdated = true;
+      // Only update one node at a time.
+      // The order of scheduling here should not affect pressure.
+      break;
+    }
+
+    for (SUnit *SU : ReadyNodes) {
+      // Remove SU from worklist.
+      WorkList.erase(SU);
+
+      MachineInstr *MI = SU->getInstr();
+      // Calc pressure based on succ nodes.
+      GCNRPTracker::LiveRegSet dagLive;
+      for (auto &Succ : SU->Succs) {
+        SUnit *SuccSU = Succ.getSUnit();
+        GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU];
+
+        GCNUpwardRPTracker RP(*LIS);
+        RP.reset(BeginMI, &SuccLive, /*After*/true);
+        if (MI) {
+          RP.reset(*MI, &SuccLive, /*After*/true);
+          // Update SuccLive based on MI.
+          RP.recede(*MI);
+        }
+        llvm::mergeLiveRegSet(dagLive, RP.getLiveRegs());
+      }
+      // Remove live which not avail in SU.
+      GCNRPTracker::LiveRegSet availLive = DagAvailRegMap[SU];
+      llvm::andLiveRegSet(dagLive, availLive);
+      DagPressureMap[SU] = dagLive;
+
+      // Add pred to work list.
+      for (auto &Pred : SU->Preds) {
+        SUnit *PredSU = Pred.getSUnit();
+        PredSU->NumSuccsLeft--;
+        WorkList.insert(PredSU);
+      }
+    }
+
+    // Avoid an infinite loop.
+    if (ReadyNodes.empty()) {
+      errs() << "dead loop when building dag pressure\n";
+      break;
+    }
+  }
+}
+
+// dump functions.
+
+std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const {
+  std::string s;
+  raw_string_ostream oss(s);
+  auto it = SUnitInputMap.find(SU);
+  if (it != SUnitInputMap.end()) {
+    oss << "<input:" << llvm::printReg(it->second) << ">";
+  } else {
+    SU->getInstr()->print(oss, /*SkipOpers=*/true);
+  }
+
+  return oss.str();
+}
+
+/// Return the label.
+std::string ExpDag::getDAGName() const {
+  return "dag.exp";
+}
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ExpDag::viewGraph(const Twine &Name, const Twine &Title) const {
+#if 0 // TODO: Re-enable this
+  // This code is only for debugging!
+#ifndef NDEBUG
+  ViewGraph(const_cast<ExpDag *>(this), Name, false, Title);
+#else
+  errs() << "BlockExpDag::viewGraph is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+#endif
+}
+
+void ExpDag::dump() {
+  viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName());
+}
+
+} // namespace llvm
+
+// Expression Dag dump.
+namespace llvm {
+
+static DenseSet<const SUnit *> ViewNodes;
+
+template <>
+struct DOTGraphTraits<llvm::ExpDag *> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(const llvm::ExpDag *G) {
+    return "ExpDag graph";
+  }
+
+  static bool renderGraphFromBottomUp() { return true; }
+
+  static bool isNodeHidden(const SUnit *Node) {
+    if (ViewNodes.empty())
+      return false;
+
+    return ViewNodes.count(Node) == 0;
+  }
+
+  static std::string getNodeIdentifierLabel(const SUnit *Node,
+                                            const llvm::ExpDag *Graph) {
+    std::string R;
+    raw_string_ostream OS(R);
+    OS << static_cast<const void *>(Node);
+    return R;
+  }
+
+  /// If you want to override the dot attributes printed for a particular
+  /// edge, override this method.
+  static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI,
+                                       const llvm::ExpDag *Graph) {
+    if (EI.isArtificialDep())
+      return "color=cyan,style=dashed";
+    if (EI.isCtrlDep())
+      return "color=blue,style=dashed";
+    return "";
+  }
+
+  static std::string getNodeLabel(const SUnit *SU, const llvm::ExpDag *Graph) {
+    std::string Str;
+    raw_string_ostream SS(Str);
+    SS << "SU:" << SU->NodeNum;
+    return SS.str();
+  }
+  static std::string getNodeDescription(const SUnit *SU, const llvm::ExpDag *G) {
+    return G->getGraphNodeLabel(SU);
+  }
+  static std::string getNodeAttributes(const SUnit *N,
+                                       const llvm::ExpDag *Graph) {
+    std::string Str("shape=Mrecord");
+
+    Str += ",style=filled,fillcolor=\"#";
+    // Use NodeQueueId as SubIdx for ExpDag.
+    Str += DOT::getColorString(N->NodeQueueId);
+    Str += '"';
+
+    return Str;
+  }
+
+  static void addCustomGraphFeatures(llvm::ExpDag *G,
+                                     GraphWriter<llvm::ExpDag *> &GW) {
+    return G->addCustomGraphFeatures(GW);
+  }
+};
+
+template <> struct GraphTraits<llvm::ExpDag *> : public GraphTraits<SUnit *> {
+  using nodes_iterator = pointer_iterator<std::vector<SUnit>::iterator>;
+  static nodes_iterator nodes_begin(llvm::ExpDag *G) {
+    return nodes_iterator(G->SUnits.begin());
+  }
+  static nodes_iterator nodes_end(llvm::ExpDag *G) {
+    return nodes_iterator(G->SUnits.end());
+  }
+};
+
+} // namespace llvm
+
+namespace llvm {
+void getRegBound(llvm::MachineBasicBlock *MBB,
+                 const llvm::MachineRegisterInfo &MRI,
+                 const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+                 llvm::LiveIntervals *LIS, unsigned &MaxVGPR,
+                 unsigned &MaxSGPR) {
+  // TODO: calc real reg bound.
+  MaxVGPR = AMDGPU::VGPR255 - AMDGPU::VGPR0;
+  MaxSGPR = AMDGPU::SGPR104 - AMDGPU::SGPR0;
+
+  const auto &EndSlot = LIS->getMBBEndIdx(MBB);
+  const GCNRPTracker::LiveRegSet outputLive =
+      llvm::getLiveRegs(EndSlot, *LIS, MRI);
+
+  auto* ST = &MBB->getParent()->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+  if (MBB->empty()) {
+    GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive);
+    MaxSGPR = MaxPressure.getSGPRNum();
+    MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts());
+    return;
+  }
+
+  BlockExpDag dag(MBB, LIS, MRI, SIRI, SIII);
+  dag.build();
+
+  std::vector<SUnit> &SUnits = dag.SUnits;
+  // Remove input nodes.
+  for (SUnit &SU : SUnits) {
+    if (!SU.isInstr())
+      continue;
+    std::vector<SDep> inputDeps;
+    for (SDep &Dep : SU.Preds) {
+      SUnit *Pred = Dep.getSUnit();
+      if (Pred->isInstr())
+        continue;
+      inputDeps.emplace_back(Dep);
+    }
+    for (SDep &Dep : inputDeps) {
+      SU.removePred(Dep);
+    }
+  }
+
+  unsigned inputSize = dag.InputSUnitMap.size();
+  unsigned instNodeSize = SUnits.size() - inputSize;
+  SUnits.erase(SUnits.begin() + instNodeSize, SUnits.end());
+
+  std::vector<llvm::SUnit *> BotRoots;
+  for (SUnit &SU : SUnits) {
+    if (SU.NumSuccsLeft == 0)
+      BotRoots.emplace_back(&SU);
+  }
+
+  auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI);
+
+  GCNUpwardRPTracker RPTracker(*LIS);
+  RPTracker.reset(MBB->front(), &outputLive, /*After*/true);
+  for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) {
+    const SUnit *SU = *it;
+    if (!SU->isInstr())
+      continue;
+    MachineInstr *MI = SU->getInstr();
+    RPTracker.recede(*MI);
+  }
+
+  GCNRegPressure MaxPressure = RPTracker.getMaxPressureAndReset();
+  MaxSGPR = MaxPressure.getSGPRNum();
+  MaxVGPR = MaxPressure.getVGPRNum(ST->hasGFX90AInsts());
+}
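+// Usage sketch (illustrative only):
+//   unsigned MaxVGPR = 0, MaxSGPR = 0;
+//   getRegBound(MBB, MRI, SIRI, SIII, LIS, MaxVGPR, MaxSGPR);
+//   // MaxVGPR/MaxSGPR now hold an estimate of the pressure achievable by
+//   // rescheduling MBB, which can be compared against the target occupancy.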
+} // namespace llvm
+
+// HRB
+namespace {
+
+std::vector<SUnit *> buildWorkList(std::vector<llvm::SUnit> &SUnits) {
+  std::vector<SUnit *> resultList;
+  resultList.reserve(SUnits.size());
+  for (SUnit &SU : SUnits) {
+    resultList.emplace_back(&SU);
+  }
+  return resultList;
+}
+
+void sortByHeight(std::vector<SUnit *> &workList) {
+  std::sort(workList.begin(), workList.end(),
+            [](const SUnit *a, const SUnit *b) {
+              // Lowest height first.
+              if (a->getHeight() < b->getHeight())
+                return true;
+              // If height the same, NodeNum big first.
+              if (a->getHeight() == b->getHeight())
+                return a->NodeNum > b->NodeNum;
+              return false;
+            });
+}
+
+void sortByInChain(std::vector<SUnit *> &workList, DenseSet<SUnit *> &Chained) {
+  // In chain nodes at end.
+  std::sort(workList.begin(), workList.end(),
+            [&Chained](const SUnit *a, const SUnit *b) {
+              return Chained.count(a) < Chained.count(b);
+            });
+}
+
+const TargetRegisterClass *getRegClass(SUnit *SU,
+                                       const MachineRegisterInfo &MRI,
+                                       const SIRegisterInfo *SIRI) {
+  if (!SU->isInstr())
+    return nullptr;
+  MachineInstr *MI = SU->getInstr();
+  if (MI->getNumDefs() == 0)
+    return nullptr;
+
+  // For an MI with more than one dst, always use the first dst.
+  MachineOperand *MO = MI->defs().begin();
+  if (!MO->isReg())
+    return nullptr;
+  unsigned Reg = MO->getReg();
+  return SIRI->getRegClassForReg(MRI, Reg);
+}
+
+unsigned getVGPRSize(const TargetRegisterClass *RC,
+                     const SIRegisterInfo *SIRI) {
+  if (!RC)
+    return 0;
+  if (SIRI->isSGPRClass(RC))
+    return 0;
+  return RC->getLaneMask().getNumLanes();
+}
+unsigned getSGPRSize(const TargetRegisterClass *RC,
+                     const SIRegisterInfo *SIRI) {
+  if (!RC)
+    return 0;
+  if (!SIRI->isSGPRClass(RC))
+    return 0;
+  return RC->getLaneMask().getNumLanes();
+}
+
+void collectSameHeightBackNodes(SUnit *SU, SmallDenseSet<SUnit *, 2> &backNodes,
+                                unsigned NodeNum,
+                                SmallDenseSet<SUnit *, 4> &visitedNodes) {
+  if (visitedNodes.count(SU))
+    return;
+  visitedNodes.insert(SU);
+
+  for (SDep &Dep : SU->Succs) {
+    if (Dep.isWeak())
+      continue;
+    if (Dep.getLatency() > 0)
+      continue;
+
+    SUnit *Succ = Dep.getSUnit();
+
+    backNodes.insert(Succ);
+    collectSameHeightBackNodes(Succ, backNodes, NodeNum, visitedNodes);
+  }
+}
+
+} // namespace
+
+namespace llvm {
+
+void HRB::Lineage::addNode(llvm::SUnit *SU) { Nodes.emplace_back(SU); }
+unsigned HRB::Lineage::getSize() const {
+  return RC ? RC->getLaneMask().getNumLanes() : 0;
+}
+unsigned HRB::Lineage::length() const { return Nodes.size(); }
+
+SUnit *HRB::Lineage::getHead() const { return Nodes.front(); }
+SUnit *HRB::Lineage::getTail() const { return Nodes.back(); }
+
+void HRB::buildLinear(std::vector<llvm::SUnit> &SUnits) {
+  // Working list from TopRoots.
+  std::vector<SUnit *> workList = buildWorkList(SUnits);
+  IntEqClasses EqClasses(SUnits.size());
+
+  while (!workList.empty()) {
+    sortByHeight(workList);
+    // Highest SU.
+    SUnit *SU = workList.back();
+    workList.pop_back();
+    if (!SU->isInstr())
+      continue;
+    if (ChainedNodes.count(SU) > 0)
+      continue;
+    bRecomputeHeight = false;
+    Lineage lineage = buildChain(SU, SUnits);
+
+    // Remove chained nodes from worklist.
+    sortByInChain(workList, ChainedNodes);
+    while (!workList.empty()) {
+      SUnit *back = workList.back();
+      if (ChainedNodes.count(back))
+        workList.pop_back();
+      else
+        break;
+    }
+
+    Lineages.emplace_back(lineage);
+
+    if (bRecomputeHeight) {
+      // Update height from tail.
+      SUnit *tail = lineage.Nodes.back();
+      tail->setDepthDirty();
+      tail->getHeight();
+    }
+  }
+
+  DenseSet<SUnit *> tailSet;
+  for (Lineage &L : Lineages) {
+    if (L.Nodes.size() < 2)
+      continue;
+    auto it = L.Nodes.rbegin();
+    it++;
+    SUnit *tail = L.Nodes.back();
+    // If already as tail for other lineage, start from next.
+    if (tailSet.count(tail) > 0) {
+      tail = *it;
+      it++;
+    } else {
+      tailSet.insert(tail);
+    }
+    for (; it != L.Nodes.rend(); it++) {
+      SUnit *SU = *it;
+      if (tail->NodeNum == -1)
+        continue;
+      EqClasses.join(SU->NodeNum, tail->NodeNum);
+    }
+  }
+
+  EqClasses.compress();
+  // TODO: assign sub class to node.
+  for (Lineage &L : Lineages) {
+    for (SUnit *SU : L.Nodes) {
+      if (SU->NodeNum == -1)
+        continue;
+      unsigned SubIdx = EqClasses[SU->NodeNum];
+      SubIdx = EqClasses[SubIdx];
+      // Use NodeQueueId as SubIdx. We don't do schedule on ExpDag.
+      SU->NodeQueueId = SubIdx;
+    }
+  }
+
+  LLVM_DEBUG(
+      dbgs() << "Chained Nodes:"; for (SUnit *SU
+                                       : ChainedNodes) {
+        dbgs() << " " << SU->NodeNum << "\n";
+      } for (int i = 0; i < Lineages.size(); i++) {
+        dbgs() << "Lineage" << i << ":";
+        Lineage &L = Lineages[i];
+        for (SUnit *SU : L.Nodes) {
+          dbgs() << " " << SU->NodeNum;
+        }
+        dbgs() << "\n";
+      });
+}
+
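+// Example: if SU has data successors B (height 3) and C (height 1), C (the
+// lowest) becomes the heir; an artificial edge B->C is added so B is scheduled
+// before the heir, and the chain then continues from C.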
+SUnit *HRB::findHeir(SUnit *SU, std::vector<llvm::SUnit> &SUnits) {
+  std::vector<SUnit *> Candidates;
+  for (SDep &Dep : SU->Succs) {
+    // Only check data dep.
+    if (Dep.getKind() != SDep::Data)
+      continue;
+
+    SUnit *Succ = Dep.getSUnit();
+    Candidates.emplace_back(Succ);
+  }
+
+  if (Candidates.empty())
+    return nullptr;
+
+  if (Candidates.size() == 1)
+    return Candidates.front();
+
+  sortByHeight(Candidates);
+  // Lowest height.
+  SUnit *Heir = Candidates.front();
+  SmallVector<SUnit *, 2> SameHeightCandidate;
+  for (SUnit *SU : Candidates) {
+    if (Heir->getHeight() != SU->getHeight())
+      break;
+    SameHeightCandidate.emplace_back(SU);
+  }
+  // Make sure choose lowest dependence between SameHeightCandidate.
+  if (SameHeightCandidate.size() > 1) {
+    for (int i = 1; i < SameHeightCandidate.size(); i++) {
+      SUnit *SU = SameHeightCandidate[i];
+      // If Heir is pred of SU, use SU.
+      if (canReach(SU, Heir))
+        Heir = SU;
+    }
+  }
+
+  unsigned HeirHeight = Heir->getHeight();
+
+  // if lowest node is in ChainedNodes, try to find same height nodes?
+
+  for (SDep &Dep : SU->Succs) {
+    // Only check data dep.
+    if (Dep.getKind() != SDep::Data)
+      continue;
+    SUnit *Succ = Dep.getSUnit();
+    if (Succ == Heir)
+      continue;
+    // Avoid cycle in DAG.
+    if (canReach(Heir, Succ))
+      return nullptr;
+    // Make sure Succ is before Heir.
+    Heir->addPred(SDep(Succ, SDep::Artificial));
+    updateReachForEdge(Succ, Heir, SUnits);
+    LLVM_DEBUG(dbgs() << "add edge from " << Succ->NodeNum << "("
+                      << Succ->getHeight() << ") to " << Heir->NodeNum << "("
+                      << HeirHeight << ")\n");
+    // Update height if need.
+    unsigned Height = Succ->getHeight();
+    if (Height <= HeirHeight) {
+      bRecomputeHeight = true;
+    }
+  }
+  return Heir;
+}
+
+HRB::Lineage HRB::buildChain(SUnit *Node,
+                             std::vector<llvm::SUnit> &SUnits) {
+  HRB::Lineage chain;
+  chain.addNode(Node);
+  ChainedNodes.insert(Node);
+  LLVM_DEBUG(dbgs() << "start chain " << Node->NodeNum << "("
+                    << Node->getHeight() << ")\n");
+  while (Node->NumSuccsLeft > 0) {
+    SUnit *Heir = findHeir(Node, SUnits);
+    if (!Heir)
+      break;
+    chain.addNode(Heir);
+
+    LLVM_DEBUG(dbgs() << "add node to chain " << Heir->NodeNum << "\n");
+    if (ChainedNodes.count(Heir) > 0)
+      break;
+    ChainedNodes.insert(Heir);
+
+    Node = Heir;
+  }
+  // Find biggest vgpr RC for the chain.
+  // TODO: Build conflict and allocate on each edge of the chain.
+  const TargetRegisterClass *RC = nullptr;
+  unsigned maxRCSize = 0;
+  for (SUnit *SU : chain.Nodes) {
+    const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI);
+    unsigned RCSize = getVGPRSize(SuRC, SIRI);
+    if (RCSize > maxRCSize) {
+      maxRCSize = RCSize;
+      RC = SuRC;
+    }
+  }
+  if (!RC) {
+    // TODO: Find biggest sgpr RC.
+    unsigned maxRCSize = 0;
+    for (SUnit *SU : chain.Nodes) {
+      const TargetRegisterClass *SuRC = getRegClass(SU, MRI, SIRI);
+      unsigned RCSize = getSGPRSize(SuRC, SIRI);
+      if (RCSize > maxRCSize) {
+        maxRCSize = RCSize;
+        RC = SuRC;
+      }
+    }
+  }
+  chain.RC = RC;
+  return chain;
+}
+
+void HRB::buildConflict() {
+
+  for (unsigned i = 0; i < Lineages.size(); i++) {
+    Lineage &a = Lineages[i];
+    for (unsigned j = i + 1; j < Lineages.size(); j++) {
+      Lineage &b = Lineages[j];
+      if (isConflict(a, b)) {
+        Color.Conflicts[i].insert(j);
+        Color.Conflicts[j].insert(i);
+        LLVM_DEBUG(dbgs() << i << " conflicts with " << j << "\n");
+      }
+    }
+    // SelfConflict.
+    Color.Conflicts[i].insert(i);
+  }
+}
+
+bool HRB::canReach(llvm::SUnit *a, llvm::SUnit *b) {
+  auto it = ReachMap.find(a);
+  // If no reach info, treat as reach.
+  if (it == ReachMap.end())
+    return true;
+  DenseSet<SUnit *> &CurReach = it->second;
+  return CurReach.find(b) != CurReach.end();
+}
+
+void HRB::updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b,
+                             std::vector<llvm::SUnit> &SUnits) {
+  DenseSet<SUnit *> &ReachA = ReachMap[a];
+  ReachA.insert(b);
+  DenseSet<SUnit *> &ReachB = ReachMap[b];
+  ReachA.insert(ReachB.begin(), ReachB.end());
+
+  for (SUnit &SU : SUnits) {
+    if (!canReach(&SU, a))
+      continue;
+
+    DenseSet<SUnit *> &CurReach = ReachMap[&SU];
+    CurReach.insert(ReachA.begin(), ReachA.end());
+  }
+}
+
+void HRB::buildReachRelation(ArrayRef<SUnit *> BotRoots) {
+  // Add fake entry to do PostOrder traversal.
+  // SUnit traversal uses Preds, so we need reverse post order.
+  SUnit FakeEntry;
+  SmallVector<SDep, 4> FakeDeps;
+  for (SUnit *Root : BotRoots) {
+    SDep Dep = SDep(Root, SDep::Artificial);
+    FakeEntry.addPred(Dep);
+    FakeDeps.emplace_back(Dep);
+  }
+
+  ReversePostOrderTraversal<SUnit *> RPOT(&FakeEntry);
+  for (SUnit *SU : RPOT) {
+    // Create Reach Set first.
+    ReachMap[SU].clear();
+  }
+  for (SUnit *SU : RPOT) {
+    DenseSet<SUnit *> &CurReach = ReachMap[SU];
+    // All Preds can reach SU and SU's reach.
+    for (SDep &Dep : SU->Preds) {
+      // Ignore weak deps.
+      if (Dep.isWeak())
+        continue;
+      DenseSet<SUnit *> &PrevReach = ReachMap[Dep.getSUnit()];
+      PrevReach.insert(SU);
+      PrevReach.insert(CurReach.begin(), CurReach.end());
+    }
+    assert(CurReach.count(SU) == 0 && "dead loop");
+  }
+  // Remove fake entry.
+  for (SDep &Dep : FakeDeps) {
+    FakeEntry.removePred(Dep);
+  }
+  ReachMap.erase(&FakeEntry);
+
+  LLVM_DEBUG(for (Lineage &L
+                  : Lineages) {
+    for (SUnit *SU : L.Nodes) {
+      DenseSet<SUnit *> &CurReach = ReachMap[SU];
+      dbgs() << SU->NodeNum << " reach: ";
+      for (SUnit *R : CurReach) {
+        dbgs() << R->NodeNum << " ";
+      }
+      dbgs() << "\n";
+    }
+  });
+}
+
+bool HRB::isConflict(const Lineage &a, const Lineage &b) {
+  // Treat SGPR and VGPR lineages as potentially conflicting to help group
+  // lineages when colors are shared. Keeping the conflict groups lineages
+  // together and avoids mixing colors across different sub exps.
+  SUnit *head0 = a.getHead();
+  SUnit *tail0 = a.getTail();
+  SUnit *head1 = b.getHead();
+  SUnit *tail1 = b.getTail();
+  DenseSet<SUnit *> &Reach0 = ReachMap[head0];
+  DenseSet<SUnit *> &Reach1 = ReachMap[head1];
+  bool r01 = Reach0.count(tail1) != 0;
+  bool r10 = Reach1.count(tail0) != 0;
+  return r01 && r10;
+}
+bool HRB::canFuse(const Lineage &a, const Lineage &b) {
+  if (a.RC != b.RC) {
+    // A lineage with no RC does not conflict with other nodes.
+    if (!a.RC)
+      return false;
+    if (!b.RC)
+      return false;
+    // SGPR and VGPR do not conflict.
+    if (SIRI->isSGPRClass(a.RC) != SIRI->isSGPRClass(b.RC))
+      return false;
+  }
+  // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa.
+  SUnit *head0 = a.getHead();
+  SUnit *tail0 = a.getTail();
+  SUnit *head1 = b.getHead();
+  SUnit *tail1 = b.getTail();
+  DenseSet<SUnit *> &Reach0 = ReachMap[head0];
+  DenseSet<SUnit *> &Reach1 = ReachMap[head1];
+  bool r01 = Reach0.count(tail1) != 0;
+  bool r10 = Reach1.count(tail0) != 0;
+  return r01 != r10;
+}
+
+bool HRB::tryFuse(Lineage &a, Lineage &b, std::vector<llvm::SUnit> &SUnits) {
+
+  // Can Fuse if a.head reach b.tail but b.head not reach a.tail and vice versa.
+  SUnit *head0 = a.getHead();
+  SUnit *tail0 = a.getTail();
+  SUnit *head1 = b.getHead();
+  SUnit *tail1 = b.getTail();
+  DenseSet<SUnit *> &Reach0 = ReachMap[head0];
+  DenseSet<SUnit *> &Reach1 = ReachMap[head1];
+  bool r01 = Reach0.count(tail1) != 0;
+  bool r10 = Reach1.count(tail0) != 0;
+  if (r01 == r10)
+    return false;
+  Lineage *newHead = &a;
+  Lineage *newTail = &b;
+  if (r01) {
+    // a reach b, b cannot reach a.
+    // link a.tail->b.head.
+    newHead = &a;
+    newTail = &b;
+  } else {
+    // b reach a, a cannot reach b.
+    // link b.tail->a.head.
+    newHead = &b;
+    newTail = &a;
+  }
+
+  // Merge reg class.
+  const TargetRegisterClass *RC0 = newHead->RC;
+  const TargetRegisterClass *RC1 = newTail->RC;
+  unsigned RC0Size = getVGPRSize(RC0, SIRI);
+  unsigned RC1Size = getVGPRSize(RC1, SIRI);
+  if (RC1Size > RC0Size)
+    newHead->RC = RC1;
+  // Merge chain.
+  SUnit *fuseTail = newHead->getTail();
+  SUnit *fuseHead = newTail->getHead();
+  assert(ReachMap[fuseHead].count(fuseTail) == 0 &&
+         "fuse edge would create a cycle");
+  fuseHead->addPred(SDep(fuseTail, SDep::Artificial));
+  LLVM_DEBUG(dbgs() << "fuse " << fuseTail->NodeNum << "->" << fuseHead->NodeNum
+                    << "\n");
+  // Update reach map.
+  updateReachForEdge(fuseTail, fuseHead, SUnits);
+  // Merge Nodes.
+  newHead->Nodes.append(newTail->Nodes.begin(), newTail->Nodes.end());
+  // Clear newTail.
+  newTail->Nodes.clear();
+  newTail->RC = nullptr;
+  return true;
+}
+
+void HRB::fusionLineages(std::vector<llvm::SUnit> &SUnits) {
+  if (Lineages.empty())
+    return;
+  bool bUpdated = true;
+  while (bUpdated) {
+    bUpdated = false;
+    int size = Lineages.size();
+    for (int i = 0; i < size; i++) {
+      Lineage &a = Lineages[i];
+      if (a.length() == 0)
+        continue;
+
+      for (int j = i + 1; j < size; j++) {
+        Lineage &b = Lineages[j];
+        if (b.length() == 0)
+          continue;
+        if (tryFuse(a, b, SUnits)) {
+          bUpdated = true;
+          if (a.length() == 0)
+            break;
+        }
+      }
+    }
+    // Remove empty lineages.
+    std::sort(Lineages.begin(), Lineages.end(),
+              [](const Lineage &a, const Lineage &b) {
+                return a.length() > b.length();
+              });
+    while (Lineages.back().length() == 0) {
+      Lineages.pop_back();
+    }
+  }
+  // Set ID after fusion.
+  unsigned ID = 0;
+  for (Lineage &L : Lineages) {
+    L.ID = ID++;
+  }
+}
+
+unsigned HRB::colorLineages(std::vector<Lineage *> &lineages,
+                            DenseMap<Lineage *, unsigned> &AllocMap,
+                            const unsigned Limit) {
+  // Allocate longer lineages first. How about size of RC?
+  std::sort(lineages.begin(), lineages.end(),
+            [](const Lineage *a, const Lineage *b) {
+              // Longer lineages are allocated first.
+              return a->length() > b->length();
+            });
+
+  unsigned maxColor = 0;
+  const unsigned VGPR_LIMIT = 256 * 4;
+
+  for (Lineage *L : lineages) {
+    unsigned ID = L->ID;
+    auto &Conflict = Color.Conflicts[ID];
+    std::bitset<VGPR_LIMIT> colors;
+    for (unsigned j : Conflict) {
+      Lineage *C = &Lineages[j];
+      if (AllocMap.count(C) == 0)
+        continue;
+      unsigned c = AllocMap[C];
+      unsigned s = C->getSize();
+      for (unsigned i = 0; i < s; i++) {
+        unsigned pos = c + i;
+        colors.set(pos);
+      }
+    }
+
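+    // First-fit scan below: find the lowest offset where `size` consecutive
+    // color slots are free. E.g. with size = 2 and slots {0, 1, 3} taken, the
+    // scan rejects 0-1 and 2-3 (slot 3 is taken) and settles on color 4.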
+    unsigned color = Limit;
+    unsigned size = L->getSize();
+    for (unsigned i = 0; i < Limit - size;) {
+      unsigned oldI = i;
+      for (unsigned j = 0; j < size; j++) {
+        unsigned pos = i + size - 1 - j;
+        if (colors.test(pos)) {
+          i = pos + 1;
+          break;
+        }
+      }
+
+      if (i != oldI)
+        continue;
+      color = i;
+      break;
+    }
+
+    AllocMap[L] = color;
+    color += size;
+    if (color > maxColor)
+      maxColor = color;
+  }
+  return maxColor;
+}
+
+void HRB::ColorResult::colorSU(SUnit *SU, unsigned color) {
+  ColorMap[SU] = color;
+}
+
+unsigned HRB::ColorResult::getLineage(SUnit *SU) const {
+  return LineageMap.find(SU)->second;
+}
+
+bool HRB::ColorResult::isConflict(const SUnit *SU0, unsigned Lineage) const {
+  const unsigned L = LineageMap.find(SU0)->second;
+  const auto &Conflict = Conflicts.find(L)->second;
+  return Conflict.count(Lineage) > 0;
+}
+
+bool HRB::ColorResult::isHead(SUnit *SU) const { return HeadSet.count(SU); }
+bool HRB::ColorResult::isTail(SUnit *SU) const { return TailSet.count(SU); }
+
+const SUnit *HRB::ColorResult::getTail(SUnit *SU) const {
+  if (!isHead(SU))
+    return nullptr;
+  auto it = HeadTailMap.find(SU);
+  return it->second;
+}
+
+unsigned HRB::ColorResult::getColor(const llvm::SUnit *SU) const {
+  auto it = ColorMap.find(SU);
+  return it->second;
+}
+
+unsigned HRB::ColorResult::getSize(const llvm::SUnit *SU) const {
+  auto it = SizeMap.find(SU);
+  return it->second;
+}
+
+HRB::ColorResult &HRB::coloring() {
+  // Collect VGPR lineages.
+  std::vector<Lineage *> vgprLineages;
+  for (Lineage &L : Lineages) {
+    auto RC = L.RC;
+    if (!RC)
+      continue;
+    if (SIRI->isSGPRClass(RC))
+      continue;
+    vgprLineages.emplace_back(&L);
+  }
+
+  const unsigned VGPR_LIMIT = 256 * 4;
+  DenseMap<Lineage *, unsigned> VAllocMap;
+  const unsigned maxVGPR = colorLineages(vgprLineages, VAllocMap, VGPR_LIMIT);
+
+  // Collect SGPR lineages.
+  std::vector<Lineage *> sgprLineages;
+  for (Lineage &L : Lineages) {
+    auto RC = L.RC;
+    if (!RC)
+      continue;
+    if (!SIRI->isSGPRClass(RC))
+      continue;
+    sgprLineages.emplace_back(&L);
+  }
+
+  const unsigned SGPR_LIMIT = 104;
+  DenseMap<Lineage *, unsigned> SAllocMap;
+  const unsigned maxSGPR = colorLineages(sgprLineages, SAllocMap, SGPR_LIMIT);
+  // +1 for each type of lineage (SGPR, VGPR, no reg).
+  const unsigned maxReg = maxSGPR + 1 + maxVGPR + 1 + 1;
+  const unsigned sgprBase = maxVGPR + 1;
+
+  for (Lineage &L : Lineages) {
+    // Collect HeadSet.
+    Color.HeadSet.insert(L.getHead());
+    Color.TailSet.insert(L.getTail());
+    Color.HeadTailMap[L.getHead()] = L.getTail();
+    // Save color.
+    auto RC = L.RC;
+    // All no reg lineage goes to maxReg.
+    unsigned color = maxReg;
+    if (RC) {
+      if (SIRI->isSGPRClass(RC))
+        color = SAllocMap[&L] + sgprBase;
+      else
+        color = VAllocMap[&L];
+    }
+    unsigned size = L.getSize();
+    for (SUnit *SU : L.Nodes) {
+      Color.colorSU(SU, color);
+      Color.SizeMap[SU] = size;
+      Color.LineageMap[SU] = L.ID;
+    }
+  }
+  Color.maxReg = maxReg;
+  Color.maxSGPR = maxSGPR;
+  Color.maxVGPR = maxVGPR;
+
+  for (unsigned i = 0; i < Lineages.size(); i++) {
+    Lineage &a = Lineages[i];
+    SUnit *headA = a.getHead();
+    unsigned colorA = Color.getColor(headA);
+    unsigned sizeA = Color.getSize(headA);
+    for (unsigned j = i + 1; j < Lineages.size(); j++) {
+      Lineage &b = Lineages[j];
+
+      SUnit *headB = b.getHead();
+      unsigned colorB = Color.getColor(headB);
+      unsigned sizeB = Color.getSize(headB);
+
+      if (colorB >= (colorA + sizeA))
+        continue;
+      if (colorA >= (colorB + sizeB))
+        continue;
+      Color.ShareColorLineages.insert(i);
+      Color.ShareColorLineages.insert(j);
+    }
+  }
+
+  return Color;
+}
+
+void HRB::dump() {
+  for (int i = 0; i < Lineages.size(); i++) {
+    dbgs() << "Lineage" << i << ":";
+    Lineage &L = Lineages[i];
+    for (SUnit *SU : L.Nodes) {
+      dbgs() << " " << SU->NodeNum;
+    }
+    dbgs() << "\n";
+    if (!Color.ColorMap.empty()) {
+      dbgs() << "color:" << Color.getColor(L.getHead())
+             << " size: " << Color.getSize(L.getHead()) << "\n";
+    }
+    if (!ReachMap.empty()) {
+      dbgs() << "conflict:";
+      for (int j = 0; j < Lineages.size(); j++) {
+        if (i == j)
+          continue;
+        if (isConflict(L, Lineages[j])) {
+          dbgs() << " " << j;
+        }
+      }
+      dbgs() << "\n";
+    }
+  }
+}
+
+void HRB::dumpReachMap() {
+  if (!ReachMap.empty()) {
+    dbgs() << "reachMap:";
+    for (auto it : ReachMap) {
+      SUnit *SU = it.first;
+      auto &Reach = it.second;
+      if (SU->isInstr()) {
+        MachineInstr *MI = SU->getInstr();
+        MI->print(dbgs());
+      }
+      dbgs() << SU->NodeNum << "can reach :\n";
+      for (SUnit *R : Reach) {
+        dbgs() << R->NodeNum << " ";
+      }
+      dbgs() << "\n";
+    }
+    dbgs() << "\n";
+  }
+}
+
+// schedule base on HRB lineages and color result.
+
+std::vector<const SUnit *> hrbSched(std::vector<SUnit> &SUnits,
+                                    std::vector<SUnit *> &BRoots,
+                                    const llvm::MachineRegisterInfo &MRI,
+                                    const llvm::SIRegisterInfo *SIRI) {
+  HRB hrb(MRI, SIRI);
+  // Build reach info to avoid an infinite loop when building linear chains.
+  hrb.buildReachRelation(BRoots);
+  hrb.buildLinear(SUnits);
+
+  std::sort(BRoots.begin(), BRoots.end(), [](const SUnit *a, const SUnit *b) {
+    return a->NumSuccsLeft < b->NumSuccsLeft;
+  });
+  while (!BRoots.empty() && BRoots.back()->NumSuccsLeft > 0) {
+    BRoots.pop_back();
+  }
+
+  hrb.buildReachRelation(BRoots);
+  hrb.fusionLineages(SUnits);
+  hrb.buildConflict();
+  const HRB::ColorResult &Color = hrb.coloring();
+
+  LLVM_DEBUG(hrb.dump());
+
+  // All lineage heads which have no preds are TopRoots.
+  // Put top roots in worklist.
+  // while worklist not empty.
+  //    if not head or color avail
+  //        is candidate.
+  //    choose best candidate by height.
+  //    update worklist.
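+  // Toy example (illustrative): with lineages L0 = {A, C} (color 0) and
+  // L1 = {B, D} (color 1), the ready list starts as {A, B}; scheduling A
+  // allocates color 0, B allocates color 1, and each color is freed again
+  // when the lineage tail (C or D) is scheduled.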
+  std::vector<SUnit *> ReadyList;
+  for (SUnit &SU : SUnits) {
+    if (SU.NumPredsLeft == 0)
+      ReadyList.emplace_back(&SU);
+  }
+  // When there is more than one sub exp in the DAG, make sure not to mix
+  // different sub exps, or scheduling can loop forever because colors go to
+  // different sub exps.
+
+  std::bitset<512 * 2> colors;
+  auto isColorAvail = [&colors](unsigned color, unsigned size) -> bool {
+    for (unsigned i = 0; i < size; i++) {
+      unsigned pos = color + i;
+      if (colors.test(pos))
+        return false;
+    }
+    return true;
+  };
+  auto allocColor = [&colors](unsigned color, unsigned size) {
+    for (unsigned i = 0; i < size; i++) {
+      unsigned pos = color + i;
+      assert(!colors.test(pos) && "color already allocated");
+      LLVM_DEBUG(dbgs() << pos << " is allocated\n");
+      colors.set(pos);
+    }
+  };
+
+  auto freeColor = [&colors](unsigned color, unsigned size) {
+    for (unsigned i = 0; i < size; i++) {
+      unsigned pos = color + i;
+      assert(colors.test(pos) && "color has not been allocated");
+      LLVM_DEBUG(dbgs() << pos << " is freed\n");
+      colors.reset(pos);
+    }
+  };
+
+  // Save color and size for each tail to support the case where two lineages
+  // share a tail. When a tail is finished, free the colors of the working
+  // lineages that end with that tail.
+  DenseMap<const SUnit *,
+           SmallVector<std::tuple<unsigned, unsigned, unsigned>, 2>>
+      TailMap;
+
+  // For lineages share same color, need to choose correct order.
+  // If l0 has color 0, l1 has color 1, l2 has color 0, l3 has color 1.
+  // l0 and l3 conflict, l1 and l2 conflict.
+  // l0 and l3 must sched together.
+  // If we sched l0 and l1, it may deadlock because l0 waits on something in l3
+  // and l1 waits on something in l2.
+  // ShareColorLineages marks lineages which share a color with other lineages.
+  // When scheduling, prefer new lineages which have more conflicts with
+  // ShareColorLineages.
+  const DenseSet<unsigned> &ShareColorLineages = Color.ShareColorLineages;
+
+  std::vector<const SUnit *> Schedule;
+  DenseSet<unsigned> UnfinishedLineages;
+  while (!ReadyList.empty()) {
+    // Schedule nodes that conflict with unfinished lineages first.
+    std::sort(ReadyList.begin(), ReadyList.end(),
+              [&UnfinishedLineages, &Color](const SUnit *a, const SUnit *b) {
+                unsigned confA = 0;
+                for (unsigned L : UnfinishedLineages) {
+                  if (Color.isConflict(a, L))
+                    confA++;
+                }
+                unsigned confB = 0;
+                for (unsigned L : UnfinishedLineages) {
+                  if (Color.isConflict(b, L))
+                    confB++;
+                }
+                return confA > confB;
+              });
+
+    LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU
+                                              : ReadyList) {
+      dbgs() << " " << SU->NodeNum;
+    } dbgs() << "\n";);
+    SUnit *Candidate = nullptr;
+    for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) {
+      SUnit *SU = *it;
+      unsigned color = Color.getColor(SU);
+      unsigned size = Color.getSize(SU);
+      // If SU is not head or color is available, SU is the candidate.
+      if (Color.isHead(SU)) {
+        if (!isColorAvail(color, size))
+          continue;
+        // alloc color.
+        allocColor(color, size);
+        // save tail color.
+        const SUnit *Tail = Color.getTail(SU);
+        unsigned ID = Color.getLineage(SU);
+        SmallVector<std::tuple<unsigned, unsigned, unsigned>, 2> &tailColors =
+            TailMap[Tail];
+        tailColors.emplace_back(std::make_tuple(color, size, ID));
+        if (ShareColorLineages.count(ID))
+          UnfinishedLineages.insert(ID);
+      }
+
+      // Free the colors of the working lineages which end with SU.
+      if (Color.isTail(SU)) {
+        auto &tailColors = TailMap[SU];
+        for (auto &tailTuple : tailColors) {
+          unsigned lineageColor, lineageSize, ID;
+          std::tie(lineageColor, lineageSize, ID) = tailTuple;
+          freeColor(lineageColor, lineageSize);
+          if (ShareColorLineages.count(ID))
+            UnfinishedLineages.insert(ID);
+        }
+        // Clear the tail.
+        TailMap.erase(SU);
+      }
+
+      Candidate = SU;
+      // Remove Candidate from ReadyList.
+      ReadyList.erase(it);
+      break;
+    }
+
+    if (!Candidate) {
+      // If no candidate was found, start a new lineage if there is one.
+      for (auto it = ReadyList.begin(); it != ReadyList.end(); it++) {
+        SUnit *SU = *it;
+
+        if (!Color.isHead(SU))
+          continue;
+        Candidate = SU;
+        // Remove Candidate from ReadyList.
+        ReadyList.erase(it);
+        break;
+      }
+    }
+    assert(Candidate && "failed to find a Candidate");
+    LLVM_DEBUG(dbgs() << "Sched " << Candidate->NodeNum << "\n");
+
+    // Add all of Candidate's successors that become ready.
+    for (SDep &Dep : Candidate->Succs) {
+      if (Dep.isWeak())
+        continue;
+      SUnit *Succ = Dep.getSUnit();
+
+      if (Succ->NumPredsLeft > 0)
+        Succ->NumPredsLeft--;
+      LLVM_DEBUG(dbgs() << "Succ " << Succ->NodeNum << " has "
+                        << Succ->NumPredsLeft << " preds\n");
+      if (Succ->NumPredsLeft == 0)
+        ReadyList.emplace_back(Succ);
+    }
+
+    // Sched Candidate.
+    assert(Candidate->isInstr() && "Candidate must be instr Node");
+    Schedule.emplace_back(Candidate);
+  }
+  assert(Schedule.size() == SUnits.size() && "SUnit size should match");
+  return Schedule;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
new file mode 100644
index 000000000000000..c234f3237079353
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
@@ -0,0 +1,197 @@
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBEXPDAG_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBEXPDAG_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+#include "llvm/MC/LaneBitmask.h"
+
+#include <string>
+#include <vector>
+
+namespace llvm {
+class MachineFunction;
+class LiveIntervals;
+class MachineRegisterInfo;
+class SIRegisterInfo;
+class SIInstrInfo;
+class MachineInstr;
+class MachineBasicBlock;
+template<typename GraphType>
+class GraphWriter;
+class SUnit;
+class IntEqClasses;
+class Twine;
+
+using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
+
+// SubExp and BlockExpDag.
+struct SubExp {
+  // Keep the original instruction order for SUnits.
+  std::vector<llvm::MachineInstr *> SUnits;
+  llvm::DenseSet<unsigned> TopRegs;
+  llvm::DenseSet<llvm::MachineInstr *> BottomRoots;
+  llvm::DenseSet<unsigned> BottomRegs;
+  bool bMultiDefOutput = false;
+  bool bHasTerminatorInst = false;
+  bool bUseIncomingReg = false;
+  bool bMoveIntoLoop = false;
+  bool bNotSafeToCopy = false;
+  bool bHasMemInst = false;
+  bool bHoist = false;
+  // If a temp/output reg is used by an inst not in the subExp, the subExp
+  // cannot be moved since not all users would be moved, but it is OK to clone.
+  bool bCloneOnly = false;
+  bool bTouchSCC = false;
+  llvm::MachineBasicBlock *FromBB;
+  llvm::MachineBasicBlock *ToBB;
+  unsigned sInputSize;
+  unsigned vInputSize;
+  unsigned sOutputSize;
+  unsigned vOutputSize;
+  unsigned sMaxSize;
+  unsigned vMaxSize;
+  LiveSet inputLive;
+  LiveSet outputLive;
+  bool isSafeToMove(const llvm::MachineRegisterInfo &MRI, bool bMoveUp) const;
+  void calcMaxPressure(const llvm::MachineRegisterInfo &MRI,
+                       const llvm::SIRegisterInfo *SIRI);
+  void dump(const llvm::MachineRegisterInfo &MRI,
+            const llvm::SIRegisterInfo *SIRI) const;
+  bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo* SIRI) const;
+};
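+
+// Illustrative example (an assumption for documentation, not taken from the
+// pass): for a two-instruction expression
+//   %t   = op %in     ; %in is live into the expression  -> TopRegs
+//   %out = op %t      ; %out is used outside it           -> BottomRegs
+// the SubExp keeps both instructions in SUnits, records FromBB (where the
+// expression currently lives) and ToBB (where it may be moved or cloned to),
+// and the s/v input/output sizes summarize its effect on register pressure.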
+
+struct ExpDag {
+  ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI,
+         const llvm::SIInstrInfo *SIII,
+         const bool bJoinInput);
+  const llvm::MachineRegisterInfo &MRI;
+  const llvm::SIRegisterInfo *SIRI;
+  const llvm::SIInstrInfo *SIII;
+  const bool bJoinInputToSubExp;
+
+  std::vector<llvm::SUnit> SUnits; ///< The scheduling units.
+  llvm::DenseMap<llvm::MachineInstr *, llvm::SUnit *> MISUnitMap;
+  llvm::DenseMap<llvm::SUnit *, llvm::MachineInstr *> SUnitMIMap;
+  llvm::DenseMap<unsigned, llvm::SUnit *> InputSUnitMap;
+  llvm::DenseMap<llvm::SUnit *, unsigned> SUnitInputMap;
+  std::vector<SubExp> SubExps;
+  template <typename T>
+  void build(const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg,
+             T &insts);
+  void dump();
+  void viewGraph(const llvm::Twine &Name, const llvm::Twine &Title) const;
+  /// Returns a label for an SUnit node in a visualization of the ScheduleDAG.
+  std::string getGraphNodeLabel(const llvm::SUnit *SU) const;
+  std::string getDAGName() const;
+  /// Adds custom features for a visualization of the ScheduleDAG.
+  void addCustomGraphFeatures(llvm::GraphWriter<ExpDag *> &) const {}
+private:
+  template<typename T>
+  void initNodes(const LiveSet &InputLiveReg, T &insts);
+  void addDataDep(const llvm::SIRegisterInfo *SIRI);
+  void addCtrlDep();
+  void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
+                   const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII);
+};
+
+struct BlockExpDag : public ExpDag {
+  BlockExpDag(llvm::MachineBasicBlock *B, llvm::LiveIntervals *LIS,
+              const llvm::MachineRegisterInfo &MRI,
+              const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII);
+  llvm::LiveIntervals *LIS;
+  llvm::MachineBasicBlock *MBB;
+  llvm::DenseMap<llvm::SUnit *, LiveSet> DagPressureMap;
+  std::vector<std::vector<llvm::SUnit *>> SUnitsInSameDepth;
+  std::vector<SubExp> SubExps;
+  void build();
+  void buildWithPressure();
+private:
+  void buildAvail(const LiveSet &passThruSet,
+                  llvm::DenseMap<llvm::SUnit *, LiveSet> &DagAvailRegMap);
+  void buildPressure(const LiveSet &StartLiveReg,
+                     const LiveSet &EndLiveReg);
+};
+
+void getRegBound(llvm::MachineBasicBlock *MBB,
+                 const llvm::MachineRegisterInfo &MRI,
+                 const llvm::SIRegisterInfo *SIRI,
+                 const llvm::SIInstrInfo *SIII, llvm::LiveIntervals *LIS,
+                 unsigned &MaxVGPR, unsigned &MaxSGPR);
+
+// Currently sgpr and vgpr are mixed when building lineages, to avoid cycles.
+// This may waste registers.
+// Based on "Minimum Register Instruction Sequencing to Reduce Register Spills
+// in Out-of-Order Issue Superscalar Architectures".
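+//
+// Illustrative sketch of the idea (not taken from the paper): for a chain
+// a -> b -> c where each value's last use is the next instruction in the
+// chain, {a, b, c} forms a single lineage whose values can live in one
+// register, so the whole lineage needs only one color.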
+class HRB {
+public:
+  struct Lineage {
+    unsigned ID = 0;
+    const llvm::TargetRegisterClass *RC = nullptr;
+    llvm::SmallVector<llvm::SUnit *, 4> Nodes;
+    llvm::SUnit *getHead() const;
+    llvm::SUnit *getTail() const;
+    void addNode(llvm::SUnit *);
+    unsigned getSize() const;
+    unsigned length() const;
+  };
+  struct ColorResult {
+    llvm::DenseMap<llvm::SUnit *, unsigned> ColorMap;
+    llvm::DenseMap<llvm::SUnit *, unsigned> SizeMap;
+    llvm::DenseMap<llvm::SUnit *, unsigned> LineageMap;
+    llvm::DenseMap<unsigned, llvm::DenseSet<unsigned>> Conflicts;
+    llvm::DenseSet<unsigned> ShareColorLineages;
+    llvm::DenseSet<llvm::SUnit *> HeadSet;
+    llvm::DenseSet<llvm::SUnit *> TailSet;
+    llvm::DenseMap<llvm::SUnit *, llvm::SUnit *> HeadTailMap;
+    unsigned maxReg = 0;
+    unsigned maxVGPR = 0;
+    unsigned maxSGPR = 0;
+    void colorSU(llvm::SUnit *SU, unsigned color);
+    unsigned getLineage(llvm::SUnit *SU) const;
+    bool isConflict(const llvm::SUnit *SU0, unsigned Lineage) const;
+    bool isHead(llvm::SUnit *SU) const;
+    bool isTail(llvm::SUnit *SU) const;
+    const llvm::SUnit *getTail(llvm::SUnit *SU) const;
+    unsigned getColor(const llvm::SUnit *SU) const;
+    unsigned getSize(const llvm::SUnit *SU) const;
+  };
+  HRB(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI)
+      : MRI(MRI), SIRI(SIRI) {}
+
+  void buildLinear(std::vector<llvm::SUnit> &SUnits);
+  void buildConflict();
+  void buildReachRelation(llvm::ArrayRef<llvm::SUnit *> BotRoots);
+  llvm::DenseMap<llvm::SUnit *, llvm::DenseSet<llvm::SUnit *>> &getReachMap() {
+    return ReachMap;
+  }
+  bool canReach(llvm::SUnit *a, llvm::SUnit *b);
+  void updateReachForEdge(llvm::SUnit *a, llvm::SUnit *b,
+                          std::vector<llvm::SUnit> &SUnits);
+  void fusionLineages(std::vector<llvm::SUnit> &SUnits);
+  ColorResult &coloring();
+  void dump();
+  void dumpReachMap();
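+
+  // A plausible call order for these entry points (illustrative only):
+  //   HRB Hrb(MRI, SIRI);
+  //   Hrb.buildLinear(SUnits);   // partition SUnits into lineages
+  //   Hrb.buildConflict();       // compute lineage interference
+  //   HRB::ColorResult &Color = Hrb.coloring();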
+
+private:
+  Lineage buildChain(llvm::SUnit *Node, std::vector<llvm::SUnit> &SUnits);
+  llvm::SUnit *findHeir(llvm::SUnit *SU, std::vector<llvm::SUnit> &SUnits);
+  bool isConflict(const Lineage &a, const Lineage &b);
+  bool canFuse(const Lineage &a, const Lineage &b);
+  bool tryFuse(Lineage &a, Lineage &b, std::vector<llvm::SUnit> &SUnits);
+  unsigned colorLineages(std::vector<Lineage *> &lineages,
+                         llvm::DenseMap<Lineage *, unsigned> &AllocMap,
+                         const unsigned Limit);
+
+  llvm::DenseSet<llvm::SUnit *> ChainedNodes;
+  llvm::DenseMap<llvm::SUnit *, llvm::DenseSet<llvm::SUnit *>> ReachMap;
+  bool bRecomputeHeight = false;
+  std::vector<Lineage> Lineages;
+  ColorResult Color;
+  const llvm::MachineRegisterInfo &MRI;
+  const llvm::SIRegisterInfo *SIRI;
+};
+
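+/// List-schedule \p SUnits using HRB lineage coloring; \p BRoots are the
+/// bottom roots of the DAG. Returns the instructions in scheduled order.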
+std::vector<const llvm::SUnit *> hrbSched(std::vector<llvm::SUnit> &SUnits,
+                                          std::vector<llvm::SUnit *> &BRoots,
+                                          const llvm::MachineRegisterInfo &MRI,
+                                          const llvm::SIRegisterInfo *SIRI);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBEXPDAG_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 96062b30fc0127a..b88673d94a19157 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -507,6 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
+  initializeAMDGPUHotBlockRematerializePass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
new file mode 100644
index 000000000000000..c9172bae2cb4ad7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
@@ -0,0 +1,106 @@
+//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Build degree about VMem to help balance latency and pressure inside a
+/// block.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUVMEMDEGREEDAG_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUVMEMDEGREEDAG_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+#include <vector>
+
+namespace llvm {
+class MachineBasicBlock;
+class SUnit;
+class SIInstrInfo;
+class MachineInstr;
+
+class SimpleDAG {
+public:
+  SimpleDAG(llvm::MachineBasicBlock &MBB, const llvm::SIInstrInfo *TII)
+      : SIII(TII), MBB(MBB) {}
+  std::vector<llvm::SUnit> SUnits;
+  // InstrInfo.
+  const llvm::SIInstrInfo *SIII;
+  llvm::DenseMap<llvm::MachineInstr *, llvm::SUnit *> MISUnitMap;
+  llvm::DenseMap<llvm::SUnit *, llvm::MachineInstr *> SUnitMIMap;
+  llvm::MachineBasicBlock &MBB;
+  void build();
+
+private:
+  void initNodes();
+  void addDependence();
+  void addCtrlDep();
+};
+
+
+// Collect height/depth for high-latency memory loads; the height/depth is only
+// updated when crossing a high-latency memory load. The height/depth is called
+// the VMem degree here.
+// The rule is that a sample and its user should have different degrees.
+// For example:
+// a = sample     // a has depth 0, height 3
+// b = sample a   // b has depth 1, height 2
+// c = sample b   // c has depth 2, height 1
+//   user of c    // user of c has depth 2, height 0
+//
+// For the purpose of in-block reorder/remat, nothing is moved/cloned across the
+// block, so this could run after cross-block remat, or in the middle of
+// cross-block remat to help reach the target when moving things across blocks
+// alone cannot. Reordering at the beginning is also possible, but there is no
+// pressure information at that point; once pressure is available, the max
+// pressure might need to be updated.
+
+class VMemDegreeDAG {
+public:
+  VMemDegreeDAG(std::vector<llvm::SUnit> &Units,
+              const llvm::SIInstrInfo *TII)
+      : SUnits(Units), SIII(TII) {}
+  std::vector<llvm::SUnit> &SUnits;
+  // InstrInfo.
+  const llvm::SIInstrInfo *SIII;
+  void build();
+
+
+  bool isHighLatency(const llvm::SUnit *SU) const;
+  bool isHighLatency(const llvm::MachineInstr *MI) const;
+  // height/depth based on Long latency inst.
+  std::vector<unsigned> VMemDataHeight;
+  std::vector<unsigned> VMemDataDepth;
+  // Full height/depth count non-data dependent too.
+  std::vector<unsigned> VMemFullHeight;
+  std::vector<unsigned> VMemFullDepth;
+  llvm::SmallVector<llvm::SUnit *, 16> VMemSUs;
+  llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16> GroupedVMemSUs;
+  llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16> GroupedVMemSUsByDepth;
+
+
+  void dump();
+
+private:
+  static constexpr unsigned kNoReg = -1;
+
+
+  std::pair<unsigned, unsigned>
+  buildVMemDepthHeight(std::vector<unsigned> &VMemHeight,
+                       std::vector<unsigned> &VMemDepth, bool bDataOnly);
+  // Compute vmem height/depth.
+  void buildVMemDepthHeight();
+  void buildVMemDataDepthHeight();
+  void groupVmemSUnits();
+
+};
+
+
+
+// Split block based on vmem depth.
+void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag);
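+
+// Illustrative usage (a sketch; assumes the calling pass provides the block
+// and the SIInstrInfo):
+//   SimpleDAG DAG(MBB, SIII);
+//   DAG.build();                          // build SUnits and dependencies
+//   VMemDegreeDAG VDag(DAG.SUnits, SIII);
+//   VDag.build();                         // compute VMem height/depth
+//   buildVMemDepth(MBB, VDag);            // split the block by VMem depth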
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUVMEMDEGREEDAG_H
+
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 408da0536237edc..92a9b3b3748ca6f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHotBlockRematerialize.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
@@ -80,10 +81,14 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMemoryUtils.cpp
+  AMDGPUMIRUtils.cpp
+  AMDGPUMirDivergenceAnalysis.cpp
+  AMDGPUMirSyncDependenceAnalysis.cpp
   AMDGPUIGroupLP.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
+  AMDGPUOccupancyAndLatencyHelper.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
@@ -106,6 +111,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUSelectionDAGInfo.cpp
   AMDGPUSetWavePriority.cpp
   AMDGPUSplitModule.cpp
+  AMDGPUSubExpDag.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 7554b9f578fcbb1..aa4b3f948b726f3 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -47,6 +47,10 @@ struct GCNRegPressure {
 
   void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
 
+  unsigned getMaxSGPR() const {
+    return std::max(getSGPRNum(), getSGPRTuplesWeight());
+  }
+
   /// \returns the SGPR32 pressure
   unsigned getSGPRNum() const { return Value[SGPR32]; }
   /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 933935a86f9f98f..cb10df2c3412906 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1313,6 +1313,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isLowLatencyInstruction(const MachineInstr &MI) const;
   bool isHighLatencyDef(int Opc) const override;
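+  /// Convenience wrapper that checks isHighLatencyDef for \p MI's opcode.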
+  bool isHighLatencyInstruction(const MachineInstr &MI) const {
+    return isHighLatencyDef(MI.getOpcode());
+  }
 
   /// Return the descriptor of the target-specific machine instruction
   /// that corresponds to the specified pseudo or native opcode.
diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
new file mode 100644
index 000000000000000..e8a66b47ac732b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
@@ -0,0 +1,405 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-hot-block-remat-aggressive -amdgpu-remat-enable-sub-exp-remat | FileCheck %s
+
+# DEFS
+# CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni00:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div00]], implicit $exec
+# CHECK: %[[#div01:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni01:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div01]], implicit $exec
+# CHECK: %[[#div02:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni02:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div02]], implicit $exec
+# CHECK: %[[#div03:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni03:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div03]], implicit $exec
+# CHECK: %[[#div04:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni04:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div04]], implicit $exec
+# CHECK: %[[#div05:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni05:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div05]], implicit $exec
+# CHECK: %[[#div06:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni06:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div06]], implicit $exec
+# CHECK: %[[#div07:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni07:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div07]], implicit $exec
+# CHECK: %[[#div08:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni08:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div08]], implicit $exec
+# CHECK: %[[#div09:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni09:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div09]], implicit $exec
+# CHECK: %[[#div10:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni10:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div10]], implicit $exec
+# CHECK: %[[#div11:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni11:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div11]], implicit $exec
+# CHECK: %[[#div12:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni12:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div12]], implicit $exec
+# CHECK: %[[#div13:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni13:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div13]], implicit $exec
+# CHECK: %[[#div14:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni14:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div14]], implicit $exec
+# CHECK: %[[#div15:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni15:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div15]], implicit $exec
+# CHECK: %[[#div16:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni16:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div16]], implicit $exec
+# CHECK: %[[#div17:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni17:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div17]], implicit $exec
+# CHECK: %[[#div18:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni18:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div18]], implicit $exec
+# CHECK: %[[#div19:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni19:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div19]], implicit $exec
+# CHECK: %[[#div20:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni20:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div20]], implicit $exec
+# CHECK: %[[#div21:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni21:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div21]], implicit $exec
+# CHECK: %[[#div22:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni22:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div22]], implicit $exec
+# CHECK: %[[#div23:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni23:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div23]], implicit $exec
+# CHECK: %[[#div24:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni24:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div24]], implicit $exec
+# CHECK: %[[#div25:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni25:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div25]], implicit $exec
+# CHECK: %[[#div26:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni26:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div26]], implicit $exec
+# CHECK: %[[#div27:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni27:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div27]], implicit $exec
+# CHECK: %[[#div28:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni28:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div28]], implicit $exec
+# CHECK: %[[#div29:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni29:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div29]], implicit $exec
+# CHECK: %[[#div30:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni30:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div30]], implicit $exec
+# CHECK: %[[#div31:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni31:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div31]], implicit $exec
+# CHECK: %[[#div32:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni32:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div32]], implicit $exec
+# CHECK: %[[#div33:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni33:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div33]], implicit $exec
+# CHECK: %[[#div34:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni34:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div34]], implicit $exec
+# CHECK: %[[#div35:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni35:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div35]], implicit $exec
+# CHECK: %[[#div36:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni36:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div36]], implicit $exec
+# CHECK: %[[#div37:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni37:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div37]], implicit $exec
+# CHECK: %[[#div38:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni38:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div38]], implicit $exec
+# CHECK: %[[#div39:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni39:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div39]], implicit $exec
+# CHECK: %[[#div40:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni40:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div40]], implicit $exec
+# CHECK: %[[#div41:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni41:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div41]], implicit $exec
+# CHECK: %[[#div42:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni42:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div42]], implicit $exec
+# CHECK: %[[#div43:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni43:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div43]], implicit $exec
+# CHECK: %[[#div44:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni44:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div44]], implicit $exec
+# CHECK: %[[#div45:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni45:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div45]], implicit $exec
+# CHECK: %[[#div46:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni46:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div46]], implicit $exec
+# CHECK: %[[#div47:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni47:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div47]], implicit $exec
+# CHECK: %[[#div48:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni48:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div48]], implicit $exec
+# CHECK: %[[#div49:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni49:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div49]], implicit $exec
+# CHECK: %[[#div50:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni50:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div50]], implicit $exec
+# CHECK: %[[#div51:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni51:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div51]], implicit $exec
+# CHECK: %[[#div52:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni52:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div52]], implicit $exec
+# CHECK: %[[#div53:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni53:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div53]], implicit $exec
+# CHECK: %[[#div54:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni54:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div54]], implicit $exec
+# CHECK: %[[#div55:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni55:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div55]], implicit $exec
+# CHECK: %[[#div56:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni56:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div56]], implicit $exec
+# CHECK: %[[#div57:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni57:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div57]], implicit $exec
+# CHECK: %[[#div58:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni58:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div58]], implicit $exec
+# CHECK: %[[#div59:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: %[[#uni59:]]:sgpr_32 = V_READFIRSTLANE_B32 %[[#div59]], implicit $exec
+
+
+# USERS:
+# CHECK: %[[#div_00:]]:vgpr_32 = COPY %[[#uni00]]
+#CHECK: EXP 0, %[[#div_00]],
+# CHECK: %[[#div_01:]]:vgpr_32 = COPY %[[#uni01]]
+#CHECK: EXP 0, %[[#div_01]],
+# CHECK: %[[#div_02:]]:vgpr_32 = COPY %[[#uni02]]
+#CHECK: EXP 0, %[[#div_02]],
+# CHECK: %[[#div_03:]]:vgpr_32 = COPY %[[#uni03]]
+#CHECK: EXP 0, %[[#div_03]],
+# CHECK: %[[#div_04:]]:vgpr_32 = COPY %[[#uni04]]
+#CHECK: EXP 0, %[[#div_04]],
+# CHECK: %[[#div_05:]]:vgpr_32 = COPY %[[#uni05]]
+#CHECK: EXP 0, %[[#div_05]],
+# CHECK: %[[#div_06:]]:vgpr_32 = COPY %[[#uni06]]
+#CHECK: EXP 0, %[[#div_06]],
+# CHECK: %[[#div_07:]]:vgpr_32 = COPY %[[#uni07]]
+#CHECK: EXP 0, %[[#div_07]],
+# CHECK: %[[#div_08:]]:vgpr_32 = COPY %[[#uni08]]
+#CHECK: EXP 0, %[[#div_08]],
+# CHECK: %[[#div_09:]]:vgpr_32 = COPY %[[#uni09]]
+#CHECK: EXP 0, %[[#div_09]],
+# CHECK: %[[#div_10:]]:vgpr_32 = COPY %[[#uni10]]
+#CHECK: EXP 0, %[[#div_10]],
+# CHECK: %[[#div_11:]]:vgpr_32 = COPY %[[#uni11]]
+#CHECK: EXP 0, %[[#div_11]],
+# CHECK: %[[#div_12:]]:vgpr_32 = COPY %[[#uni12]]
+#CHECK: EXP 0, %[[#div_12]],
+# CHECK: %[[#div_13:]]:vgpr_32 = COPY %[[#uni13]]
+#CHECK: EXP 0, %[[#div_13]],
+# CHECK: %[[#div_14:]]:vgpr_32 = COPY %[[#uni14]]
+#CHECK: EXP 0, %[[#div_14]],
+# CHECK: %[[#div_15:]]:vgpr_32 = COPY %[[#uni15]]
+#CHECK: EXP 0, %[[#div_15]],
+# CHECK: %[[#div_16:]]:vgpr_32 = COPY %[[#uni16]]
+#CHECK: EXP 0, %[[#div_16]],
+# CHECK: %[[#div_17:]]:vgpr_32 = COPY %[[#uni17]]
+#CHECK: EXP 0, %[[#div_17]],
+# CHECK: %[[#div_18:]]:vgpr_32 = COPY %[[#uni18]]
+#CHECK: EXP 0, %[[#div_18]],
+# CHECK: %[[#div_19:]]:vgpr_32 = COPY %[[#uni19]]
+#CHECK: EXP 0, %[[#div_19]],
+# CHECK: %[[#div_20:]]:vgpr_32 = COPY %[[#uni20]]
+#CHECK: EXP 0, %[[#div_20]],
+# CHECK: %[[#div_21:]]:vgpr_32 = COPY %[[#uni21]]
+#CHECK: EXP 0, %[[#div_21]],
+# CHECK: %[[#div_22:]]:vgpr_32 = COPY %[[#uni22]]
+#CHECK: EXP 0, %[[#div_22]],
+# CHECK: %[[#div_23:]]:vgpr_32 = COPY %[[#uni23]]
+#CHECK: EXP 0, %[[#div_23]],
+# CHECK: %[[#div_24:]]:vgpr_32 = COPY %[[#uni24]]
+#CHECK: EXP 0, %[[#div_24]],
+# CHECK: %[[#div_25:]]:vgpr_32 = COPY %[[#uni25]]
+#CHECK: EXP 0, %[[#div_25]],
+# CHECK: %[[#div_26:]]:vgpr_32 = COPY %[[#uni26]]
+#CHECK: EXP 0, %[[#div_26]],
+# CHECK: %[[#div_27:]]:vgpr_32 = COPY %[[#uni27]]
+#CHECK: EXP 0, %[[#div_27]],
+# CHECK: %[[#div_28:]]:vgpr_32 = COPY %[[#uni28]]
+#CHECK: EXP 0, %[[#div_28]],
+# CHECK: %[[#div_29:]]:vgpr_32 = COPY %[[#uni29]]
+#CHECK: EXP 0, %[[#div_29]],
+# CHECK: %[[#div_30:]]:vgpr_32 = COPY %[[#uni30]]
+#CHECK: EXP 0, %[[#div_30]],
+# CHECK: %[[#div_31:]]:vgpr_32 = COPY %[[#uni31]]
+#CHECK: EXP 0, %[[#div_31]],
+# CHECK: %[[#div_32:]]:vgpr_32 = COPY %[[#uni32]]
+#CHECK: EXP 0, %[[#div_32]],
+# CHECK: %[[#div_33:]]:vgpr_32 = COPY %[[#uni33]]
+#CHECK: EXP 0, %[[#div_33]],
+# CHECK: %[[#div_34:]]:vgpr_32 = COPY %[[#uni34]]
+#CHECK: EXP 0, %[[#div_34]],
+# CHECK: %[[#div_35:]]:vgpr_32 = COPY %[[#uni35]]
+#CHECK: EXP 0, %[[#div_35]],
+# CHECK: %[[#div_36:]]:vgpr_32 = COPY %[[#uni36]]
+#CHECK: EXP 0, %[[#div_36]],
+# CHECK: %[[#div_37:]]:vgpr_32 = COPY %[[#uni37]]
+#CHECK: EXP 0, %[[#div_37]],
+# CHECK: %[[#div_38:]]:vgpr_32 = COPY %[[#uni38]]
+#CHECK: EXP 0, %[[#div_38]],
+# CHECK: %[[#div_39:]]:vgpr_32 = COPY %[[#uni39]]
+#CHECK: EXP 0, %[[#div_39]],
+# CHECK: %[[#div_40:]]:vgpr_32 = COPY %[[#uni40]]
+#CHECK: EXP 0, %[[#div_40]],
+# CHECK: %[[#div_41:]]:vgpr_32 = COPY %[[#uni41]]
+#CHECK: EXP 0, %[[#div_41]],
+# CHECK: %[[#div_42:]]:vgpr_32 = COPY %[[#uni42]]
+#CHECK: EXP 0, %[[#div_42]],
+# CHECK: %[[#div_43:]]:vgpr_32 = COPY %[[#uni43]]
+#CHECK: EXP 0, %[[#div_43]],
+# CHECK: %[[#div_44:]]:vgpr_32 = COPY %[[#uni44]]
+#CHECK: EXP 0, %[[#div_44]],
+# CHECK: %[[#div_45:]]:vgpr_32 = COPY %[[#uni45]]
+#CHECK: EXP 0, %[[#div_45]],
+# CHECK: %[[#div_46:]]:vgpr_32 = COPY %[[#uni46]]
+#CHECK: EXP 0, %[[#div_46]],
+# CHECK: %[[#div_47:]]:vgpr_32 = COPY %[[#uni47]]
+#CHECK: EXP 0, %[[#div_47]],
+# CHECK: %[[#div_48:]]:vgpr_32 = COPY %[[#uni48]]
+#CHECK: EXP 0, %[[#div_48]],
+# CHECK: %[[#div_49:]]:vgpr_32 = COPY %[[#uni49]]
+#CHECK: EXP 0, %[[#div_49]],
+# CHECK: %[[#div_50:]]:vgpr_32 = COPY %[[#uni50]]
+#CHECK: EXP 0, %[[#div_50]],
+# CHECK: %[[#div_51:]]:vgpr_32 = COPY %[[#uni51]]
+#CHECK: EXP 0, %[[#div_51]],
+# CHECK: %[[#div_52:]]:vgpr_32 = COPY %[[#uni52]]
+#CHECK: EXP 0, %[[#div_52]],
+# CHECK: %[[#div_53:]]:vgpr_32 = COPY %[[#uni53]]
+#CHECK: EXP 0, %[[#div_53]],
+# CHECK: %[[#div_54:]]:vgpr_32 = COPY %[[#uni54]]
+#CHECK: EXP 0, %[[#div_54]],
+# CHECK: %[[#div_55:]]:vgpr_32 = COPY %[[#uni55]]
+#CHECK: EXP 0, %[[#div_55]],
+# CHECK: %[[#div_56:]]:vgpr_32 = COPY %[[#uni56]]
+#CHECK: EXP 0, %[[#div_56]],
+# CHECK: %[[#div_57:]]:vgpr_32 = COPY %[[#uni57]]
+#CHECK: EXP 0, %[[#div_57]],
+# CHECK: %[[#div_58:]]:vgpr_32 = COPY %[[#uni58]]
+#CHECK: EXP 0, %[[#div_58]],
+# CHECK: %[[#div_59:]]:vgpr_32 = COPY %[[#uni59]]
+#CHECK: EXP 0, %[[#div_59]],
+
+
+--- |
+  source_filename = ".\main.ll"
+  define amdgpu_ps void @main() #1 {
+    ret void
+  }
+  attributes #1 = { "target-cpu"="gfx1010" }
+  !llvm.ident = !{!0}
+  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name:            main
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+  - { reg: '$sgpr8' }
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $sgpr1, $sgpr8, $vgpr0, $vgpr1
+
+    %1000:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1001:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1002:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1003:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1004:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1005:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1006:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1007:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1008:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1009:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %1059, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:  
+    successors: %bb.2
+    %99:vgpr_32 = COPY %1058
+    S_BRANCH %bb.2
+
+  bb.2:
+    %1:vgpr_32 = IMPLICIT_DEF
+    EXP 0, killed %1000, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1001, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1002, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1003, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1004, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1005, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1006, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1007, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1008, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1009, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1010, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1011, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1012, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1013, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1014, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1015, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1016, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1017, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1018, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1019, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1020, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1021, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1022, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1023, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1024, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1025, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1026, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1027, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1028, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1029, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1030, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1031, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1032, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1033, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1034, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1035, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1036, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1037, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1038, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1039, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1040, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1041, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1042, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1043, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1044, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1045, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1046, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1047, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1048, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1049, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1050, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1051, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1052, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1053, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1054, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1055, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1056, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1057, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1058, %1, %1, %1, -1, -1, 15, implicit $exec
+    EXP 0, killed %1059, %1, %1, %1, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...

>From 280571f2da195c1bd53e47c6f676999214233a80 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Thu, 6 Feb 2025 13:52:02 -0800
Subject: [PATCH 2/3] Fixed build, and added simple tests that exercise major
 code paths

---
 .../AMDGPU/AMDGPUHotBlockRematerialize.cpp    |   8 +-
 .../test/CodeGen/AMDGPU/remat/group_remat.mir | 507 ++++++++++++++
 .../AMDGPU/remat/group_remat_with_uses.mir    | 641 ++++++++++++++++++
 .../test/CodeGen/AMDGPU/remat/simple_sgpr.mir | 450 ++++++++++++
 4 files changed, 1603 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/group_remat.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 44ebaa2d51bec19..8647185bf5d51b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -291,7 +291,7 @@ unsigned CollectFnPressure(
     MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
     const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
     RematStatus &status) {
-  unsigned TgtOcc = ST->getOccupancyWithLocalMemSize(MF);
+  unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second;
   // If only have one block, input/ouput virtual live set are empty.
   if (MF.size() > 1) {
     // Build input output live reg first.
@@ -1351,7 +1351,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
   bool bForceRematSgpr = bSGPRSpill | status.bNotBalance;
 
   // If bound by lds, skip.
-  if (status.TargetOcc > ST->getOccupancyWithLocalMemSize(MF) &&
+  if (status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second &&
       !bForceRematSgpr)
     return false;
 
@@ -1663,6 +1663,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
     Register OpReg = Op.getReg();
     if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO))
       continue;
+    if (IsImplicitUseOfReg(Op, AMDGPU::MODE))
+      continue;
     if (IsImplicitUseOfReg(Op, AMDGPU::M0) && isPhyRegUniqueDef(OpReg, MRI))
       continue;
     // Alow unused scc define.
@@ -4454,7 +4456,7 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
   }
 
   // If bound by lds, skip.
-  if ((status.TargetOcc + 1) > ST->getOccupancyWithLocalMemSize(MF) &&
+  if ((status.TargetOcc + 1) > ST->getOccupancyWithWorkGroupSizes(MF).second &&
       !bSGPRSpill)
     return false;
 
diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir
new file mode 100644
index 000000000000000..7f3483c66a5d9b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir
@@ -0,0 +1,507 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat | FileCheck %s
+
+# Check that the whole expression gets moved to uses in bb.2.
+# CHECK: bb.0:
+# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0
+# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1
+# CHECK: bb.1:
+# CHECK: bb.2:
+# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]]
+# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]]
+# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]]
+# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]]
+# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]]
+# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]]
+# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]]
+# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]]
+# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]]
+# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]]
+# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]]
+# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]]
+# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]]
+# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]]
+# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]]
+# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]]
+# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]]
+# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]]
+# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]]
+# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]]
+# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]]
+# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]]
+# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]]
+# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]]
+# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]]
+# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]]
+# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]]
+# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]]
+# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]]
+# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]]
+# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]]
+# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]]
+# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]]
+# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]]
+# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]]
+# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]]
+# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]]
+# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]]
+# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]]
+# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]]
+# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]]
+# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]]
+# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]]
+# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]]
+# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]]
+# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]]
+# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]]
+# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]]
+# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]]
+# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]]
+# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]]
+# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]]
+# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]]
+# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]]
+# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]]
+# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]]
+# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]]
+# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]]
+# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]]
+# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]]
+# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]]
+# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]]
+# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]]
+# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]]
+# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]]
+# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]]
+# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]]
+# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]]
+# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]]
+# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]]
+# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]]
+# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]]
+# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]]
+# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]]
+# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]]
+# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]]
+# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]]
+# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]]
+# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]]
+# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]]
+# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]]
+# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]]
+# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]]
+# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]]
+# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]]
+# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]]
+# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]]
+# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]]
+# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]]
+# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]]
+# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]]
+# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]]
+# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]]
+# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]]
+# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]]
+# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]]
+# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]]
+# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]]
+# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]]
+# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]]
+# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]]
+# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]]
+# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]]
+# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]]
+# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]]
+# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]]
+# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]]
+# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]]
+# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]]
+# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]]
+# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]]
+# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]]
+# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]]
+# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]]
+# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]]
+# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]]
+# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]]
+# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]]
+# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]]
+# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]]
+# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]]
+# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]]
+# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]]
+# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]]
+# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]]
+# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]]
+# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]]
+# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]]
+# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]]
+# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]]
+# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]]
+# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]]
+
+
+--- |
+  source_filename = ".\main.ll"
+  define amdgpu_ps void @main() #1 {
+    ret void
+  }
+  attributes #1 = { "target-cpu"="gfx1010" }
+  !llvm.ident = !{!0}
+  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name:            main
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+  - { reg: '$sgpr2' }
+  - { reg: '$sgpr3' }
+  - { reg: '$sgpr4' }
+  - { reg: '$sgpr5' }
+  - { reg: '$sgpr6' }
+  - { reg: '$sgpr7' }
+  - { reg: '$sgpr8' }
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+    undef %0.sub0:sgpr_64 = COPY $sgpr0
+    undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+    undef %1.sub0:sgpr_128 = COPY $sgpr4
+    undef %1.sub1:sgpr_128 = COPY $sgpr5
+    undef %1.sub2:sgpr_128 = COPY $sgpr6
+    undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+
+    %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec
+    %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode
+    %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode
+    %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode
+    %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode
+    %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode
+    %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode
+    %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode
+    %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode
+    %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode
+    %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode
+    %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode
+    %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode
+    %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode
+    %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode
+    %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode
+    %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode
+    %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode
+    %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode
+    %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode
+    %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode
+    %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode
+    %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode
+    %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode
+    %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode
+    %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode
+    %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode
+    %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode
+    %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode
+    %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode
+    %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode
+    %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode
+    %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode
+    %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode
+    %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode
+    %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode
+    %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode
+    %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode
+    %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode
+    %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode
+    %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode
+    %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode
+    %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode
+    %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode
+    %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode
+    %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode
+    %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode
+    %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode
+    %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode
+    %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode
+    %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode
+    %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode
+    %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode
+    %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode
+    %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode
+    %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode
+    %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode
+    %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode
+    %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode
+    %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode
+    %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode
+    %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode
+    %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode
+    %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode
+    %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode
+    %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode
+    %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode
+    %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode
+    %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode
+    %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode
+    %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode
+    %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode
+    %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode
+    %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode
+    %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode
+    %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode
+    %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode
+    %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode
+    %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode
+    %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode
+    %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode
+    %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode
+    %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode
+    %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode
+    %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode
+    %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode
+    %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode
+    %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode
+    %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode
+    %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode
+    %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode
+    %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode
+    %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode
+    %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode
+    %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode
+    %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode
+    %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode
+    %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode
+    %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode
+    %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode
+    %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode
+    %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode
+    %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode
+    %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode
+    %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode
+    %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode
+    %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode
+    %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode
+    %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode
+    %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode
+    %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode
+    %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode
+    %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode
+    %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode
+    %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode
+    %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode
+    %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode
+    %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode
+    %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode
+    %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode
+    %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode
+    %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode
+    %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode
+    %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode
+    %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode
+    %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode
+    %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode
+    %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode
+    %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode
+    %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode
+    %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode
+    %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode
+    %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode
+
+
+    %8000:vgpr_32 = IMPLICIT_DEF
+    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    %8001:vgpr_32 = COPY %8000
+    %8002:vgpr_32 = COPY %8000
+    %8003:vgpr_32 = COPY %8000
+    %8004:vgpr_32 = COPY %8000
+    %8005:vgpr_32 = COPY %8000
+    %8006:vgpr_32 = COPY %8000
+    %8007:vgpr_32 = COPY %8000
+    %8008:vgpr_32 = COPY %8000
+    %8009:vgpr_32 = COPY %8000
+    %8010:vgpr_32 = COPY %8000
+    %8011:vgpr_32 = COPY %8000
+    %8012:vgpr_32 = COPY %8000
+    %8013:vgpr_32 = COPY %8000
+    %8014:vgpr_32 = COPY %8000
+    %8015:vgpr_32 = COPY %8000
+    %8016:vgpr_32 = COPY %8000
+    %8017:vgpr_32 = COPY %8000
+
+    %9001:vgpr_32 = COPY %8001
+    %9002:vgpr_32 = COPY %8002
+    %9003:vgpr_32 = COPY %8003
+    %9004:vgpr_32 = COPY %8004
+    %9005:vgpr_32 = COPY %8005
+    %9006:vgpr_32 = COPY %8006
+    %9007:vgpr_32 = COPY %8007
+    %9008:vgpr_32 = COPY %8008
+    %9009:vgpr_32 = COPY %8009
+    %9010:vgpr_32 = COPY %8010
+    %9011:vgpr_32 = COPY %8011
+    %9012:vgpr_32 = COPY %8012
+    %9013:vgpr_32 = COPY %8013
+    %9014:vgpr_32 = COPY %8014
+    %9015:vgpr_32 = COPY %8015
+    %9016:vgpr_32 = COPY %8016
+    %9017:vgpr_32 = COPY %8017
+
+    S_BRANCH %bb.2
+
+  bb.2:
+
+    %3:vgpr_32 = IMPLICIT_DEF
+
+    EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %505, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+    S_ENDPGM 0
+...
+    
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
new file mode 100644
index 000000000000000..637a683bdd041d4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
@@ -0,0 +1,641 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat -amdgpu-remat-enable-sub-exp-remat-aggressive | FileCheck %s
+
+# Check that the whole expression gets CLONED next to its uses in bb.2.
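+#
+# A rough sketch of the rewrite these CHECK lines verify (hypothetical
+# registers %a / %a.clone, not the captured values below): the V_MUL_F32 chain
+# is defined and already consumed by EXPs in bb.0, and its results are consumed
+# again by the EXPs in bb.2. Because the bb.0 uses remain, the chain cannot
+# simply be sunk; the pass is expected to clone it, recomputing the values from
+# %500 and %501 right before the bb.2 uses instead of keeping every
+# intermediate VGPR live across bb.1:
+#
+#   bb.0:                                bb.0:
+#     %a = V_MUL_F32_e32 %500, %500        %a = V_MUL_F32_e32 %500, %500
+#     EXP 0, %a, ...               ==>     EXP 0, %a, ...
+#   bb.2:                                bb.2:
+#     EXP 0, killed %a, ...                %a.clone = V_MUL_F32_e32 %500, %500
+#                                          EXP 0, killed %a.clone, ...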
+# CHECK: bb.0:
+# CHECK: %[[#r500:]]:vgpr_32 = V_MOV_B32_e32 $vgpr0
+# CHECK: %[[#r501:]]:vgpr_32 = V_MOV_B32_e32 $vgpr1
+# CHECK: bb.1:
+# CHECK: bb.2:
+# CHECK: %[[#r502:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r500]]
+# CHECK: %[[#r503:]]:vgpr_32 = V_MUL_F32_e32 %[[#r500]], %[[#r501]]
+# CHECK: %[[#r504:]]:vgpr_32 = V_MUL_F32_e32 %[[#r501]], %[[#r501]]
+# CHECK: %[[#r505:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r502]]
+# CHECK: %[[#r506:]]:vgpr_32 = V_MUL_F32_e32 %[[#r502]], %[[#r503]]
+# CHECK: %[[#r507:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r503]]
+# CHECK: %[[#r508:]]:vgpr_32 = V_MUL_F32_e32 %[[#r503]], %[[#r504]]
+# CHECK: %[[#r509:]]:vgpr_32 = V_MUL_F32_e32 %[[#r504]], %[[#r504]]
+# CHECK: %[[#r5010:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r505]]
+# CHECK: %[[#r5011:]]:vgpr_32 = V_MUL_F32_e32 %[[#r505]], %[[#r506]]
+# CHECK: %[[#r5012:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r506]]
+# CHECK: %[[#r5013:]]:vgpr_32 = V_MUL_F32_e32 %[[#r506]], %[[#r507]]
+# CHECK: %[[#r5014:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r507]]
+# CHECK: %[[#r5015:]]:vgpr_32 = V_MUL_F32_e32 %[[#r507]], %[[#r508]]
+# CHECK: %[[#r5016:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r508]]
+# CHECK: %[[#r5017:]]:vgpr_32 = V_MUL_F32_e32 %[[#r508]], %[[#r509]]
+# CHECK: %[[#r5018:]]:vgpr_32 = V_MUL_F32_e32 %[[#r509]], %[[#r509]]
+# CHECK: %[[#r5019:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5010]]
+# CHECK: %[[#r5020:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5010]], %[[#r5011]]
+# CHECK: %[[#r5021:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5011]]
+# CHECK: %[[#r5022:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5011]], %[[#r5012]]
+# CHECK: %[[#r5023:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5012]]
+# CHECK: %[[#r5024:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5012]], %[[#r5013]]
+# CHECK: %[[#r5025:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5013]]
+# CHECK: %[[#r5026:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5013]], %[[#r5014]]
+# CHECK: %[[#r5027:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5014]]
+# CHECK: %[[#r5028:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5014]], %[[#r5015]]
+# CHECK: %[[#r5029:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5015]]
+# CHECK: %[[#r5030:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5015]], %[[#r5016]]
+# CHECK: %[[#r5031:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5016]]
+# CHECK: %[[#r5032:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5016]], %[[#r5017]]
+# CHECK: %[[#r5033:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5017]]
+# CHECK: %[[#r5034:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5017]], %[[#r5018]]
+# CHECK: %[[#r5035:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5018]], %[[#r5018]]
+# CHECK: %[[#r5036:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5019]]
+# CHECK: %[[#r5037:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5019]], %[[#r5020]]
+# CHECK: %[[#r5038:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5020]]
+# CHECK: %[[#r5039:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5020]], %[[#r5021]]
+# CHECK: %[[#r5040:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5021]]
+# CHECK: %[[#r5041:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5021]], %[[#r5022]]
+# CHECK: %[[#r5042:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5022]]
+# CHECK: %[[#r5043:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5022]], %[[#r5023]]
+# CHECK: %[[#r5044:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5023]]
+# CHECK: %[[#r5045:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5023]], %[[#r5024]]
+# CHECK: %[[#r5046:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5024]]
+# CHECK: %[[#r5047:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5024]], %[[#r5025]]
+# CHECK: %[[#r5048:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5025]]
+# CHECK: %[[#r5049:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5025]], %[[#r5026]]
+# CHECK: %[[#r5050:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5026]]
+# CHECK: %[[#r5051:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5026]], %[[#r5027]]
+# CHECK: %[[#r5052:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5027]]
+# CHECK: %[[#r5053:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5027]], %[[#r5028]]
+# CHECK: %[[#r5054:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5028]]
+# CHECK: %[[#r5055:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5028]], %[[#r5029]]
+# CHECK: %[[#r5056:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5029]]
+# CHECK: %[[#r5057:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5029]], %[[#r5030]]
+# CHECK: %[[#r5058:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5030]]
+# CHECK: %[[#r5059:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5030]], %[[#r5031]]
+# CHECK: %[[#r5060:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5031]]
+# CHECK: %[[#r5061:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5031]], %[[#r5032]]
+# CHECK: %[[#r5062:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5032]]
+# CHECK: %[[#r5063:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5032]], %[[#r5033]]
+# CHECK: %[[#r5064:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5033]]
+# CHECK: %[[#r5065:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5033]], %[[#r5034]]
+# CHECK: %[[#r5066:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5034]]
+# CHECK: %[[#r5067:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5034]], %[[#r5035]]
+# CHECK: %[[#r5068:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5035]], %[[#r5035]]
+# CHECK: %[[#r5069:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5036]]
+# CHECK: %[[#r5070:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5036]], %[[#r5037]]
+# CHECK: %[[#r5071:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5037]]
+# CHECK: %[[#r5072:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5037]], %[[#r5038]]
+# CHECK: %[[#r5073:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5038]]
+# CHECK: %[[#r5074:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5038]], %[[#r5039]]
+# CHECK: %[[#r5075:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5039]]
+# CHECK: %[[#r5076:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5039]], %[[#r5040]]
+# CHECK: %[[#r5077:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5040]]
+# CHECK: %[[#r5078:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5040]], %[[#r5041]]
+# CHECK: %[[#r5079:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5041]]
+# CHECK: %[[#r5080:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5041]], %[[#r5042]]
+# CHECK: %[[#r5081:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5042]]
+# CHECK: %[[#r5082:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5042]], %[[#r5043]]
+# CHECK: %[[#r5083:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5043]]
+# CHECK: %[[#r5084:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5043]], %[[#r5044]]
+# CHECK: %[[#r5085:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5044]]
+# CHECK: %[[#r5086:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5044]], %[[#r5045]]
+# CHECK: %[[#r5087:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5045]]
+# CHECK: %[[#r5088:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5045]], %[[#r5046]]
+# CHECK: %[[#r5089:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5046]]
+# CHECK: %[[#r5090:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5046]], %[[#r5047]]
+# CHECK: %[[#r5091:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5047]]
+# CHECK: %[[#r5092:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5047]], %[[#r5048]]
+# CHECK: %[[#r5093:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5048]]
+# CHECK: %[[#r5094:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5048]], %[[#r5049]]
+# CHECK: %[[#r5095:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5049]]
+# CHECK: %[[#r5096:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5049]], %[[#r5050]]
+# CHECK: %[[#r5097:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5050]]
+# CHECK: %[[#r5098:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5050]], %[[#r5051]]
+# CHECK: %[[#r5099:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5051]]
+# CHECK: %[[#r50100:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5051]], %[[#r5052]]
+# CHECK: %[[#r50101:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5052]]
+# CHECK: %[[#r50102:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5052]], %[[#r5053]]
+# CHECK: %[[#r50103:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5053]]
+# CHECK: %[[#r50104:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5053]], %[[#r5054]]
+# CHECK: %[[#r50105:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5054]]
+# CHECK: %[[#r50106:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5054]], %[[#r5055]]
+# CHECK: %[[#r50107:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5055]]
+# CHECK: %[[#r50108:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5055]], %[[#r5056]]
+# CHECK: %[[#r50109:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5056]]
+# CHECK: %[[#r50110:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5056]], %[[#r5057]]
+# CHECK: %[[#r50111:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5057]]
+# CHECK: %[[#r50112:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5057]], %[[#r5058]]
+# CHECK: %[[#r50113:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5058]]
+# CHECK: %[[#r50114:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5058]], %[[#r5059]]
+# CHECK: %[[#r50115:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5059]]
+# CHECK: %[[#r50116:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5059]], %[[#r5060]]
+# CHECK: %[[#r50117:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5060]]
+# CHECK: %[[#r50118:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5060]], %[[#r5061]]
+# CHECK: %[[#r50119:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5061]]
+# CHECK: %[[#r50120:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5061]], %[[#r5062]]
+# CHECK: %[[#r50121:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5062]]
+# CHECK: %[[#r50122:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5062]], %[[#r5063]]
+# CHECK: %[[#r50123:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5063]]
+# CHECK: %[[#r50124:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5063]], %[[#r5064]]
+# CHECK: %[[#r50125:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5064]]
+# CHECK: %[[#r50126:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5064]], %[[#r5065]]
+# CHECK: %[[#r50127:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5065]]
+# CHECK: %[[#r50128:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5065]], %[[#r5066]]
+# CHECK: %[[#r50129:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5066]]
+# CHECK: %[[#r50130:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5066]], %[[#r5067]]
+# CHECK: %[[#r50131:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5067]]
+# CHECK: %[[#r50132:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5067]], %[[#r5068]]
+# CHECK: %[[#r50133:]]:vgpr_32 = V_MUL_F32_e32 %[[#r5068]], %[[#r5068]]
+
+
+--- |
+  source_filename = ".\main.ll"
+  define amdgpu_ps void @main() #1 {
+    ret void
+  }
+  attributes #1 = { "target-cpu"="gfx1010" }
+  !llvm.ident = !{!0}
+  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name:            main
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+  - { reg: '$sgpr2' }
+  - { reg: '$sgpr3' }
+  - { reg: '$sgpr4' }
+  - { reg: '$sgpr5' }
+  - { reg: '$sgpr6' }
+  - { reg: '$sgpr7' }
+  - { reg: '$sgpr8' }
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+    undef %0.sub0:sgpr_64 = COPY $sgpr0
+    undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+    undef %1.sub0:sgpr_128 = COPY $sgpr4
+    undef %1.sub1:sgpr_128 = COPY $sgpr5
+    undef %1.sub2:sgpr_128 = COPY $sgpr6
+    undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+
+    %500:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %501:vgpr_32 = V_MOV_B32_e32 $vgpr1, implicit $exec
+    %502:vgpr_32 = V_MUL_F32_e32 %500, %500, implicit $exec, implicit $mode
+    %503:vgpr_32 = V_MUL_F32_e32 %500, %501, implicit $exec, implicit $mode
+    %504:vgpr_32 = V_MUL_F32_e32 %501, %501, implicit $exec, implicit $mode
+    %505:vgpr_32 = V_MUL_F32_e32 %502, %502, implicit $exec, implicit $mode
+    %506:vgpr_32 = V_MUL_F32_e32 %502, %503, implicit $exec, implicit $mode
+    %507:vgpr_32 = V_MUL_F32_e32 %503, %503, implicit $exec, implicit $mode
+    %508:vgpr_32 = V_MUL_F32_e32 %503, %504, implicit $exec, implicit $mode
+    %509:vgpr_32 = V_MUL_F32_e32 %504, %504, implicit $exec, implicit $mode
+    %5010:vgpr_32 = V_MUL_F32_e32 %505, %505, implicit $exec, implicit $mode
+    %5011:vgpr_32 = V_MUL_F32_e32 %505, %506, implicit $exec, implicit $mode
+    %5012:vgpr_32 = V_MUL_F32_e32 %506, %506, implicit $exec, implicit $mode
+    %5013:vgpr_32 = V_MUL_F32_e32 %506, %507, implicit $exec, implicit $mode
+    %5014:vgpr_32 = V_MUL_F32_e32 %507, %507, implicit $exec, implicit $mode
+    %5015:vgpr_32 = V_MUL_F32_e32 %507, %508, implicit $exec, implicit $mode
+    %5016:vgpr_32 = V_MUL_F32_e32 %508, %508, implicit $exec, implicit $mode
+    %5017:vgpr_32 = V_MUL_F32_e32 %508, %509, implicit $exec, implicit $mode
+    %5018:vgpr_32 = V_MUL_F32_e32 %509, %509, implicit $exec, implicit $mode
+    %5019:vgpr_32 = V_MUL_F32_e32 %5010, %5010, implicit $exec, implicit $mode
+    %5020:vgpr_32 = V_MUL_F32_e32 %5010, %5011, implicit $exec, implicit $mode
+    %5021:vgpr_32 = V_MUL_F32_e32 %5011, %5011, implicit $exec, implicit $mode
+    %5022:vgpr_32 = V_MUL_F32_e32 %5011, %5012, implicit $exec, implicit $mode
+    %5023:vgpr_32 = V_MUL_F32_e32 %5012, %5012, implicit $exec, implicit $mode
+    %5024:vgpr_32 = V_MUL_F32_e32 %5012, %5013, implicit $exec, implicit $mode
+    %5025:vgpr_32 = V_MUL_F32_e32 %5013, %5013, implicit $exec, implicit $mode
+    %5026:vgpr_32 = V_MUL_F32_e32 %5013, %5014, implicit $exec, implicit $mode
+    %5027:vgpr_32 = V_MUL_F32_e32 %5014, %5014, implicit $exec, implicit $mode
+    %5028:vgpr_32 = V_MUL_F32_e32 %5014, %5015, implicit $exec, implicit $mode
+    %5029:vgpr_32 = V_MUL_F32_e32 %5015, %5015, implicit $exec, implicit $mode
+    %5030:vgpr_32 = V_MUL_F32_e32 %5015, %5016, implicit $exec, implicit $mode
+    %5031:vgpr_32 = V_MUL_F32_e32 %5016, %5016, implicit $exec, implicit $mode
+    %5032:vgpr_32 = V_MUL_F32_e32 %5016, %5017, implicit $exec, implicit $mode
+    %5033:vgpr_32 = V_MUL_F32_e32 %5017, %5017, implicit $exec, implicit $mode
+    %5034:vgpr_32 = V_MUL_F32_e32 %5017, %5018, implicit $exec, implicit $mode
+    %5035:vgpr_32 = V_MUL_F32_e32 %5018, %5018, implicit $exec, implicit $mode
+    %5036:vgpr_32 = V_MUL_F32_e32 %5019, %5019, implicit $exec, implicit $mode
+    %5037:vgpr_32 = V_MUL_F32_e32 %5019, %5020, implicit $exec, implicit $mode
+    %5038:vgpr_32 = V_MUL_F32_e32 %5020, %5020, implicit $exec, implicit $mode
+    %5039:vgpr_32 = V_MUL_F32_e32 %5020, %5021, implicit $exec, implicit $mode
+    %5040:vgpr_32 = V_MUL_F32_e32 %5021, %5021, implicit $exec, implicit $mode
+    %5041:vgpr_32 = V_MUL_F32_e32 %5021, %5022, implicit $exec, implicit $mode
+    %5042:vgpr_32 = V_MUL_F32_e32 %5022, %5022, implicit $exec, implicit $mode
+    %5043:vgpr_32 = V_MUL_F32_e32 %5022, %5023, implicit $exec, implicit $mode
+    %5044:vgpr_32 = V_MUL_F32_e32 %5023, %5023, implicit $exec, implicit $mode
+    %5045:vgpr_32 = V_MUL_F32_e32 %5023, %5024, implicit $exec, implicit $mode
+    %5046:vgpr_32 = V_MUL_F32_e32 %5024, %5024, implicit $exec, implicit $mode
+    %5047:vgpr_32 = V_MUL_F32_e32 %5024, %5025, implicit $exec, implicit $mode
+    %5048:vgpr_32 = V_MUL_F32_e32 %5025, %5025, implicit $exec, implicit $mode
+    %5049:vgpr_32 = V_MUL_F32_e32 %5025, %5026, implicit $exec, implicit $mode
+    %5050:vgpr_32 = V_MUL_F32_e32 %5026, %5026, implicit $exec, implicit $mode
+    %5051:vgpr_32 = V_MUL_F32_e32 %5026, %5027, implicit $exec, implicit $mode
+    %5052:vgpr_32 = V_MUL_F32_e32 %5027, %5027, implicit $exec, implicit $mode
+    %5053:vgpr_32 = V_MUL_F32_e32 %5027, %5028, implicit $exec, implicit $mode
+    %5054:vgpr_32 = V_MUL_F32_e32 %5028, %5028, implicit $exec, implicit $mode
+    %5055:vgpr_32 = V_MUL_F32_e32 %5028, %5029, implicit $exec, implicit $mode
+    %5056:vgpr_32 = V_MUL_F32_e32 %5029, %5029, implicit $exec, implicit $mode
+    %5057:vgpr_32 = V_MUL_F32_e32 %5029, %5030, implicit $exec, implicit $mode
+    %5058:vgpr_32 = V_MUL_F32_e32 %5030, %5030, implicit $exec, implicit $mode
+    %5059:vgpr_32 = V_MUL_F32_e32 %5030, %5031, implicit $exec, implicit $mode
+    %5060:vgpr_32 = V_MUL_F32_e32 %5031, %5031, implicit $exec, implicit $mode
+    %5061:vgpr_32 = V_MUL_F32_e32 %5031, %5032, implicit $exec, implicit $mode
+    %5062:vgpr_32 = V_MUL_F32_e32 %5032, %5032, implicit $exec, implicit $mode
+    %5063:vgpr_32 = V_MUL_F32_e32 %5032, %5033, implicit $exec, implicit $mode
+    %5064:vgpr_32 = V_MUL_F32_e32 %5033, %5033, implicit $exec, implicit $mode
+    %5065:vgpr_32 = V_MUL_F32_e32 %5033, %5034, implicit $exec, implicit $mode
+    %5066:vgpr_32 = V_MUL_F32_e32 %5034, %5034, implicit $exec, implicit $mode
+    %5067:vgpr_32 = V_MUL_F32_e32 %5034, %5035, implicit $exec, implicit $mode
+    %5068:vgpr_32 = V_MUL_F32_e32 %5035, %5035, implicit $exec, implicit $mode
+    %5069:vgpr_32 = V_MUL_F32_e32 %5036, %5036, implicit $exec, implicit $mode
+    %5070:vgpr_32 = V_MUL_F32_e32 %5036, %5037, implicit $exec, implicit $mode
+    %5071:vgpr_32 = V_MUL_F32_e32 %5037, %5037, implicit $exec, implicit $mode
+    %5072:vgpr_32 = V_MUL_F32_e32 %5037, %5038, implicit $exec, implicit $mode
+    %5073:vgpr_32 = V_MUL_F32_e32 %5038, %5038, implicit $exec, implicit $mode
+    %5074:vgpr_32 = V_MUL_F32_e32 %5038, %5039, implicit $exec, implicit $mode
+    %5075:vgpr_32 = V_MUL_F32_e32 %5039, %5039, implicit $exec, implicit $mode
+    %5076:vgpr_32 = V_MUL_F32_e32 %5039, %5040, implicit $exec, implicit $mode
+    %5077:vgpr_32 = V_MUL_F32_e32 %5040, %5040, implicit $exec, implicit $mode
+    %5078:vgpr_32 = V_MUL_F32_e32 %5040, %5041, implicit $exec, implicit $mode
+    %5079:vgpr_32 = V_MUL_F32_e32 %5041, %5041, implicit $exec, implicit $mode
+    %5080:vgpr_32 = V_MUL_F32_e32 %5041, %5042, implicit $exec, implicit $mode
+    %5081:vgpr_32 = V_MUL_F32_e32 %5042, %5042, implicit $exec, implicit $mode
+    %5082:vgpr_32 = V_MUL_F32_e32 %5042, %5043, implicit $exec, implicit $mode
+    %5083:vgpr_32 = V_MUL_F32_e32 %5043, %5043, implicit $exec, implicit $mode
+    %5084:vgpr_32 = V_MUL_F32_e32 %5043, %5044, implicit $exec, implicit $mode
+    %5085:vgpr_32 = V_MUL_F32_e32 %5044, %5044, implicit $exec, implicit $mode
+    %5086:vgpr_32 = V_MUL_F32_e32 %5044, %5045, implicit $exec, implicit $mode
+    %5087:vgpr_32 = V_MUL_F32_e32 %5045, %5045, implicit $exec, implicit $mode
+    %5088:vgpr_32 = V_MUL_F32_e32 %5045, %5046, implicit $exec, implicit $mode
+    %5089:vgpr_32 = V_MUL_F32_e32 %5046, %5046, implicit $exec, implicit $mode
+    %5090:vgpr_32 = V_MUL_F32_e32 %5046, %5047, implicit $exec, implicit $mode
+    %5091:vgpr_32 = V_MUL_F32_e32 %5047, %5047, implicit $exec, implicit $mode
+    %5092:vgpr_32 = V_MUL_F32_e32 %5047, %5048, implicit $exec, implicit $mode
+    %5093:vgpr_32 = V_MUL_F32_e32 %5048, %5048, implicit $exec, implicit $mode
+    %5094:vgpr_32 = V_MUL_F32_e32 %5048, %5049, implicit $exec, implicit $mode
+    %5095:vgpr_32 = V_MUL_F32_e32 %5049, %5049, implicit $exec, implicit $mode
+    %5096:vgpr_32 = V_MUL_F32_e32 %5049, %5050, implicit $exec, implicit $mode
+    %5097:vgpr_32 = V_MUL_F32_e32 %5050, %5050, implicit $exec, implicit $mode
+    %5098:vgpr_32 = V_MUL_F32_e32 %5050, %5051, implicit $exec, implicit $mode
+    %5099:vgpr_32 = V_MUL_F32_e32 %5051, %5051, implicit $exec, implicit $mode
+    %50100:vgpr_32 = V_MUL_F32_e32 %5051, %5052, implicit $exec, implicit $mode
+    %50101:vgpr_32 = V_MUL_F32_e32 %5052, %5052, implicit $exec, implicit $mode
+    %50102:vgpr_32 = V_MUL_F32_e32 %5052, %5053, implicit $exec, implicit $mode
+    %50103:vgpr_32 = V_MUL_F32_e32 %5053, %5053, implicit $exec, implicit $mode
+    %50104:vgpr_32 = V_MUL_F32_e32 %5053, %5054, implicit $exec, implicit $mode
+    %50105:vgpr_32 = V_MUL_F32_e32 %5054, %5054, implicit $exec, implicit $mode
+    %50106:vgpr_32 = V_MUL_F32_e32 %5054, %5055, implicit $exec, implicit $mode
+    %50107:vgpr_32 = V_MUL_F32_e32 %5055, %5055, implicit $exec, implicit $mode
+    %50108:vgpr_32 = V_MUL_F32_e32 %5055, %5056, implicit $exec, implicit $mode
+    %50109:vgpr_32 = V_MUL_F32_e32 %5056, %5056, implicit $exec, implicit $mode
+    %50110:vgpr_32 = V_MUL_F32_e32 %5056, %5057, implicit $exec, implicit $mode
+    %50111:vgpr_32 = V_MUL_F32_e32 %5057, %5057, implicit $exec, implicit $mode
+    %50112:vgpr_32 = V_MUL_F32_e32 %5057, %5058, implicit $exec, implicit $mode
+    %50113:vgpr_32 = V_MUL_F32_e32 %5058, %5058, implicit $exec, implicit $mode
+    %50114:vgpr_32 = V_MUL_F32_e32 %5058, %5059, implicit $exec, implicit $mode
+    %50115:vgpr_32 = V_MUL_F32_e32 %5059, %5059, implicit $exec, implicit $mode
+    %50116:vgpr_32 = V_MUL_F32_e32 %5059, %5060, implicit $exec, implicit $mode
+    %50117:vgpr_32 = V_MUL_F32_e32 %5060, %5060, implicit $exec, implicit $mode
+    %50118:vgpr_32 = V_MUL_F32_e32 %5060, %5061, implicit $exec, implicit $mode
+    %50119:vgpr_32 = V_MUL_F32_e32 %5061, %5061, implicit $exec, implicit $mode
+    %50120:vgpr_32 = V_MUL_F32_e32 %5061, %5062, implicit $exec, implicit $mode
+    %50121:vgpr_32 = V_MUL_F32_e32 %5062, %5062, implicit $exec, implicit $mode
+    %50122:vgpr_32 = V_MUL_F32_e32 %5062, %5063, implicit $exec, implicit $mode
+    %50123:vgpr_32 = V_MUL_F32_e32 %5063, %5063, implicit $exec, implicit $mode
+    %50124:vgpr_32 = V_MUL_F32_e32 %5063, %5064, implicit $exec, implicit $mode
+    %50125:vgpr_32 = V_MUL_F32_e32 %5064, %5064, implicit $exec, implicit $mode
+    %50126:vgpr_32 = V_MUL_F32_e32 %5064, %5065, implicit $exec, implicit $mode
+    %50127:vgpr_32 = V_MUL_F32_e32 %5065, %5065, implicit $exec, implicit $mode
+    %50128:vgpr_32 = V_MUL_F32_e32 %5065, %5066, implicit $exec, implicit $mode
+    %50129:vgpr_32 = V_MUL_F32_e32 %5066, %5066, implicit $exec, implicit $mode
+    %50130:vgpr_32 = V_MUL_F32_e32 %5066, %5067, implicit $exec, implicit $mode
+    %50131:vgpr_32 = V_MUL_F32_e32 %5067, %5067, implicit $exec, implicit $mode
+    %50132:vgpr_32 = V_MUL_F32_e32 %5067, %5068, implicit $exec, implicit $mode
+    %50133:vgpr_32 = V_MUL_F32_e32 %5068, %5068, implicit $exec, implicit $mode
+    EXP 0, %500, %500, %500, %500, -1, -1, 15, implicit $exec
+    EXP 0, %501, %501, %501, %501, -1, -1, 15, implicit $exec
+    EXP 0, %502, %502, %502, %502, -1, -1, 15, implicit $exec
+    EXP 0, %503, %503, %503, %503, -1, -1, 15, implicit $exec
+    EXP 0, %504, %504, %504, %504, -1, -1, 15, implicit $exec
+    EXP 0, %505, %505, %505, %505, -1, -1, 15, implicit $exec
+    EXP 0, %506, %506, %506, %506, -1, -1, 15, implicit $exec
+    EXP 0, %507, %507, %507, %507, -1, -1, 15, implicit $exec
+    EXP 0, %508, %508, %508, %508, -1, -1, 15, implicit $exec
+    EXP 0, %509, %509, %509, %509, -1, -1, 15, implicit $exec
+    EXP 0, %5010, %5010, %5010, %5010, -1, -1, 15, implicit $exec
+    EXP 0, %5011, %5011, %5011, %5011, -1, -1, 15, implicit $exec
+    EXP 0, %5012, %5012, %5012, %5012, -1, -1, 15, implicit $exec
+    EXP 0, %5013, %5013, %5013, %5013, -1, -1, 15, implicit $exec
+    EXP 0, %5014, %5014, %5014, %5014, -1, -1, 15, implicit $exec
+    EXP 0, %5015, %5015, %5015, %5015, -1, -1, 15, implicit $exec
+    EXP 0, %5016, %5016, %5016, %5016, -1, -1, 15, implicit $exec
+    EXP 0, %5017, %5017, %5017, %5017, -1, -1, 15, implicit $exec
+    EXP 0, %5018, %5018, %5018, %5018, -1, -1, 15, implicit $exec
+    EXP 0, %5019, %5019, %5019, %5019, -1, -1, 15, implicit $exec
+    EXP 0, %5020, %5020, %5020, %5020, -1, -1, 15, implicit $exec
+    EXP 0, %5021, %5021, %5021, %5021, -1, -1, 15, implicit $exec
+    EXP 0, %5022, %5022, %5022, %5022, -1, -1, 15, implicit $exec
+    EXP 0, %5023, %5023, %5023, %5023, -1, -1, 15, implicit $exec
+    EXP 0, %5024, %5024, %5024, %5024, -1, -1, 15, implicit $exec
+    EXP 0, %5025, %5025, %5025, %5025, -1, -1, 15, implicit $exec
+    EXP 0, %5026, %5026, %5026, %5026, -1, -1, 15, implicit $exec
+    EXP 0, %5027, %5027, %5027, %5027, -1, -1, 15, implicit $exec
+    EXP 0, %5028, %5028, %5028, %5028, -1, -1, 15, implicit $exec
+    EXP 0, %5029, %5029, %5029, %5029, -1, -1, 15, implicit $exec
+    EXP 0, %5030, %5030, %5030, %5030, -1, -1, 15, implicit $exec
+    EXP 0, %5031, %5031, %5031, %5031, -1, -1, 15, implicit $exec
+    EXP 0, %5032, %5032, %5032, %5032, -1, -1, 15, implicit $exec
+    EXP 0, %5033, %5033, %5033, %5033, -1, -1, 15, implicit $exec
+    EXP 0, %5034, %5034, %5034, %5034, -1, -1, 15, implicit $exec
+    EXP 0, %5035, %5035, %5035, %5035, -1, -1, 15, implicit $exec
+    EXP 0, %5036, %5036, %5036, %5036, -1, -1, 15, implicit $exec
+    EXP 0, %5037, %5037, %5037, %5037, -1, -1, 15, implicit $exec
+    EXP 0, %5038, %5038, %5038, %5038, -1, -1, 15, implicit $exec
+    EXP 0, %5039, %5039, %5039, %5039, -1, -1, 15, implicit $exec
+    EXP 0, %5040, %5040, %5040, %5040, -1, -1, 15, implicit $exec
+    EXP 0, %5041, %5041, %5041, %5041, -1, -1, 15, implicit $exec
+    EXP 0, %5042, %5042, %5042, %5042, -1, -1, 15, implicit $exec
+    EXP 0, %5043, %5043, %5043, %5043, -1, -1, 15, implicit $exec
+    EXP 0, %5044, %5044, %5044, %5044, -1, -1, 15, implicit $exec
+    EXP 0, %5045, %5045, %5045, %5045, -1, -1, 15, implicit $exec
+    EXP 0, %5046, %5046, %5046, %5046, -1, -1, 15, implicit $exec
+    EXP 0, %5047, %5047, %5047, %5047, -1, -1, 15, implicit $exec
+    EXP 0, %5048, %5048, %5048, %5048, -1, -1, 15, implicit $exec
+    EXP 0, %5049, %5049, %5049, %5049, -1, -1, 15, implicit $exec
+    EXP 0, %5050, %5050, %5050, %5050, -1, -1, 15, implicit $exec
+    EXP 0, %5051, %5051, %5051, %5051, -1, -1, 15, implicit $exec
+    EXP 0, %5052, %5052, %5052, %5052, -1, -1, 15, implicit $exec
+    EXP 0, %5053, %5053, %5053, %5053, -1, -1, 15, implicit $exec
+    EXP 0, %5054, %5054, %5054, %5054, -1, -1, 15, implicit $exec
+    EXP 0, %5055, %5055, %5055, %5055, -1, -1, 15, implicit $exec
+    EXP 0, %5056, %5056, %5056, %5056, -1, -1, 15, implicit $exec
+    EXP 0, %5057, %5057, %5057, %5057, -1, -1, 15, implicit $exec
+    EXP 0, %5058, %5058, %5058, %5058, -1, -1, 15, implicit $exec
+    EXP 0, %5059, %5059, %5059, %5059, -1, -1, 15, implicit $exec
+    EXP 0, %5060, %5060, %5060, %5060, -1, -1, 15, implicit $exec
+    EXP 0, %5061, %5061, %5061, %5061, -1, -1, 15, implicit $exec
+    EXP 0, %5062, %5062, %5062, %5062, -1, -1, 15, implicit $exec
+    EXP 0, %5063, %5063, %5063, %5063, -1, -1, 15, implicit $exec
+    EXP 0, %5064, %5064, %5064, %5064, -1, -1, 15, implicit $exec
+    EXP 0, %5065, %5065, %5065, %5065, -1, -1, 15, implicit $exec
+    EXP 0, %5066, %5066, %5066, %5066, -1, -1, 15, implicit $exec
+    EXP 0, %5067, %5067, %5067, %5067, -1, -1, 15, implicit $exec
+    EXP 0, %5068, %5068, %5068, %5068, -1, -1, 15, implicit $exec
+    EXP 0, %5069, %5069, %5069, %5069, -1, -1, 15, implicit $exec
+    EXP 0, %5070, %5070, %5070, %5070, -1, -1, 15, implicit $exec
+    EXP 0, %5071, %5071, %5071, %5071, -1, -1, 15, implicit $exec
+    EXP 0, %5072, %5072, %5072, %5072, -1, -1, 15, implicit $exec
+    EXP 0, %5073, %5073, %5073, %5073, -1, -1, 15, implicit $exec
+    EXP 0, %5074, %5074, %5074, %5074, -1, -1, 15, implicit $exec
+    EXP 0, %5075, %5075, %5075, %5075, -1, -1, 15, implicit $exec
+    EXP 0, %5076, %5076, %5076, %5076, -1, -1, 15, implicit $exec
+    EXP 0, %5077, %5077, %5077, %5077, -1, -1, 15, implicit $exec
+    EXP 0, %5078, %5078, %5078, %5078, -1, -1, 15, implicit $exec
+    EXP 0, %5079, %5079, %5079, %5079, -1, -1, 15, implicit $exec
+    EXP 0, %5080, %5080, %5080, %5080, -1, -1, 15, implicit $exec
+    EXP 0, %5081, %5081, %5081, %5081, -1, -1, 15, implicit $exec
+    EXP 0, %5082, %5082, %5082, %5082, -1, -1, 15, implicit $exec
+    EXP 0, %5083, %5083, %5083, %5083, -1, -1, 15, implicit $exec
+    EXP 0, %5084, %5084, %5084, %5084, -1, -1, 15, implicit $exec
+    EXP 0, %5085, %5085, %5085, %5085, -1, -1, 15, implicit $exec
+    EXP 0, %5086, %5086, %5086, %5086, -1, -1, 15, implicit $exec
+    EXP 0, %5087, %5087, %5087, %5087, -1, -1, 15, implicit $exec
+    EXP 0, %5088, %5088, %5088, %5088, -1, -1, 15, implicit $exec
+    EXP 0, %5089, %5089, %5089, %5089, -1, -1, 15, implicit $exec
+    EXP 0, %5090, %5090, %5090, %5090, -1, -1, 15, implicit $exec
+    EXP 0, %5091, %5091, %5091, %5091, -1, -1, 15, implicit $exec
+    EXP 0, %5092, %5092, %5092, %5092, -1, -1, 15, implicit $exec
+    EXP 0, %5093, %5093, %5093, %5093, -1, -1, 15, implicit $exec
+    EXP 0, %5094, %5094, %5094, %5094, -1, -1, 15, implicit $exec
+    EXP 0, %5095, %5095, %5095, %5095, -1, -1, 15, implicit $exec
+    EXP 0, %5096, %5096, %5096, %5096, -1, -1, 15, implicit $exec
+    EXP 0, %5097, %5097, %5097, %5097, -1, -1, 15, implicit $exec
+    EXP 0, %5098, %5098, %5098, %5098, -1, -1, 15, implicit $exec
+    EXP 0, %5099, %5099, %5099, %5099, -1, -1, 15, implicit $exec
+    EXP 0, %50100, %50100, %50100, %50100, -1, -1, 15, implicit $exec
+    EXP 0, %50101, %50101, %50101, %50101, -1, -1, 15, implicit $exec
+    EXP 0, %50102, %50102, %50102, %50102, -1, -1, 15, implicit $exec
+    EXP 0, %50103, %50103, %50103, %50103, -1, -1, 15, implicit $exec
+    EXP 0, %50104, %50104, %50104, %50104, -1, -1, 15, implicit $exec
+    EXP 0, %50105, %50105, %50105, %50105, -1, -1, 15, implicit $exec
+    EXP 0, %50106, %50106, %50106, %50106, -1, -1, 15, implicit $exec
+    EXP 0, %50107, %50107, %50107, %50107, -1, -1, 15, implicit $exec
+    EXP 0, %50108, %50108, %50108, %50108, -1, -1, 15, implicit $exec
+    EXP 0, %50109, %50109, %50109, %50109, -1, -1, 15, implicit $exec
+    EXP 0, %50110, %50110, %50110, %50110, -1, -1, 15, implicit $exec
+    EXP 0, %50111, %50111, %50111, %50111, -1, -1, 15, implicit $exec
+    EXP 0, %50112, %50112, %50112, %50112, -1, -1, 15, implicit $exec
+    EXP 0, %50113, %50113, %50113, %50113, -1, -1, 15, implicit $exec
+    EXP 0, %50114, %50114, %50114, %50114, -1, -1, 15, implicit $exec
+    EXP 0, %50115, %50115, %50115, %50115, -1, -1, 15, implicit $exec
+    EXP 0, %50116, %50116, %50116, %50116, -1, -1, 15, implicit $exec
+    EXP 0, %50117, %50117, %50117, %50117, -1, -1, 15, implicit $exec
+    EXP 0, %50118, %50118, %50118, %50118, -1, -1, 15, implicit $exec
+    EXP 0, %50119, %50119, %50119, %50119, -1, -1, 15, implicit $exec
+    EXP 0, %50120, %50120, %50120, %50120, -1, -1, 15, implicit $exec
+    EXP 0, %50121, %50121, %50121, %50121, -1, -1, 15, implicit $exec
+    EXP 0, %50122, %50122, %50122, %50122, -1, -1, 15, implicit $exec
+    EXP 0, %50123, %50123, %50123, %50123, -1, -1, 15, implicit $exec
+    EXP 0, %50124, %50124, %50124, %50124, -1, -1, 15, implicit $exec
+    EXP 0, %50125, %50125, %50125, %50125, -1, -1, 15, implicit $exec
+    EXP 0, %50126, %50126, %50126, %50126, -1, -1, 15, implicit $exec
+    EXP 0, %50127, %50127, %50127, %50127, -1, -1, 15, implicit $exec
+    EXP 0, %50128, %50128, %50128, %50128, -1, -1, 15, implicit $exec
+    EXP 0, %50129, %50129, %50129, %50129, -1, -1, 15, implicit $exec
+    EXP 0, %50130, %50130, %50130, %50130, -1, -1, 15, implicit $exec
+    EXP 0, %50131, %50131, %50131, %50131, -1, -1, 15, implicit $exec
+    EXP 0, %50132, %50132, %50132, %50132, -1, -1, 15, implicit $exec
+    EXP 0, %50133, %50133, %50133, %50133, -1, -1, 15, implicit $exec
+
+
+    %8000:vgpr_32 = IMPLICIT_DEF
+    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    %8001:vgpr_32 = COPY %8000
+    %8002:vgpr_32 = COPY %8000
+    %8003:vgpr_32 = COPY %8000
+    %8004:vgpr_32 = COPY %8000
+    %8005:vgpr_32 = COPY %8000
+    %8006:vgpr_32 = COPY %8000
+    %8007:vgpr_32 = COPY %8000
+    %8008:vgpr_32 = COPY %8000
+    %8009:vgpr_32 = COPY %8000
+    %8010:vgpr_32 = COPY %8000
+    %8011:vgpr_32 = COPY %8000
+    %8012:vgpr_32 = COPY %8000
+    %8013:vgpr_32 = COPY %8000
+    %8014:vgpr_32 = COPY %8000
+    %8015:vgpr_32 = COPY %8000
+    %8016:vgpr_32 = COPY %8000
+    %8017:vgpr_32 = COPY %8000
+
+    %9001:vgpr_32 = COPY %8001
+    %9002:vgpr_32 = COPY %8002
+    %9003:vgpr_32 = COPY %8003
+    %9004:vgpr_32 = COPY %8004
+    %9005:vgpr_32 = COPY %8005
+    %9006:vgpr_32 = COPY %8006
+    %9007:vgpr_32 = COPY %8007
+    %9008:vgpr_32 = COPY %8008
+    %9009:vgpr_32 = COPY %8009
+    %9010:vgpr_32 = COPY %8010
+    %9011:vgpr_32 = COPY %8011
+    %9012:vgpr_32 = COPY %8012
+    %9013:vgpr_32 = COPY %8013
+    %9014:vgpr_32 = COPY %8014
+    %9015:vgpr_32 = COPY %8015
+    %9016:vgpr_32 = COPY %8016
+    %9017:vgpr_32 = COPY %8017
+
+    S_BRANCH %bb.2
+
+  bb.2:
+
+    %3:vgpr_32 = IMPLICIT_DEF
+
+    EXP 0, killed %500, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %501, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %502, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %503, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %504, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %505, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %506, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %507, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %508, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %509, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5010, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5011, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5012, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5013, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5014, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5015, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5016, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5017, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5018, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5019, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5020, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5021, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5022, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5023, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5024, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5025, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5026, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5027, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5028, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5029, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5030, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5031, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5032, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5033, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5034, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5035, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5036, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5037, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5038, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5039, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5040, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5041, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5042, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5043, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5044, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5045, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5046, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5047, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5048, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5049, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5050, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5051, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5052, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5053, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5054, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5055, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5056, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5057, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5058, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5059, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5060, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5061, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5062, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5063, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5064, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5065, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5066, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5067, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5068, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5069, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5070, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5071, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5072, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5073, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5074, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5075, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5076, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5077, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5078, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5079, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5080, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5081, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5082, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5083, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5084, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5085, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5086, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5087, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5088, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5089, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5090, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5091, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5092, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5093, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5094, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5095, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5096, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5097, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5098, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %5099, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50100, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50101, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50102, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50103, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50104, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50105, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50106, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50107, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50108, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50109, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50110, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50111, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50112, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50113, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50114, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50115, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50116, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50117, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50118, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50119, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50120, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50121, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50122, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50123, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50124, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50125, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50126, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50127, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50128, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50129, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50130, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50131, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50132, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %50133, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+    S_ENDPGM 0
+...
+    
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
new file mode 100644
index 000000000000000..bc2c97f91f46c67
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
@@ -0,0 +1,450 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
+
+# Check that the loads have been moved to the use
+# CHECK: bb.2:
+# CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0
+# CHECK: %[[#reg1:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 16, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg1]], %{{.+}}, 16, 0
+# CHECK: %[[#reg2:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 32, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg2]], %{{.+}}, 32, 0
+# CHECK: %[[#reg3:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 48, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg3]], %{{.+}}, 48, 0
+# CHECK: %[[#reg4:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 64, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg4]], %{{.+}}, 64, 0
+# CHECK: %[[#reg5:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 80, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg5]], %{{.+}}, 80, 0
+# CHECK: %[[#reg6:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 96, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg6]], %{{.+}}, 96, 0
+# CHECK: %[[#reg7:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 112, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg7]], %{{.+}}, 112, 0
+# CHECK: %[[#reg8:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 128, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg8]], %{{.+}}, 128, 0
+# CHECK: %[[#reg9:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 144, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg9]], %{{.+}}, 144, 0
+# CHECK: %[[#reg10:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 160, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg10]], %{{.+}}, 160, 0
+# CHECK: %[[#reg11:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 176, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg11]], %{{.+}}, 176, 0
+# CHECK: %[[#reg12:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 192, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg12]], %{{.+}}, 192, 0
+# CHECK: %[[#reg13:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 208, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg13]], %{{.+}}, 208, 0
+# CHECK: %[[#reg14:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 224, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg14]], %{{.+}}, 224, 0
+# CHECK: %[[#reg15:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 240, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg15]], %{{.+}}, 240, 0
+# CHECK: %[[#reg16:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 256, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg16]], %{{.+}}, 256, 0
+# CHECK: %[[#reg17:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 272, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg17]], %{{.+}}, 272, 0
+# CHECK: %[[#reg18:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 288, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg18]], %{{.+}}, 288, 0
+# CHECK: %[[#reg19:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 304, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg19]], %{{.+}}, 304, 0
+# CHECK: %[[#reg20:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 320, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg20]], %{{.+}}, 320, 0
+# CHECK: %[[#reg21:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 336, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg21]], %{{.+}}, 336, 0
+# CHECK: %[[#reg22:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 352, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg22]], %{{.+}}, 352, 0
+# CHECK: %[[#reg23:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 368, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg23]], %{{.+}}, 368, 0
+# CHECK: %[[#reg24:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 384, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg24]], %{{.+}}, 384, 0
+# CHECK: %[[#reg25:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 400, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg25]], %{{.+}}, 400, 0
+# CHECK: %[[#reg26:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 416, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg26]], %{{.+}}, 416, 0
+# CHECK: %[[#reg27:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 432, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg27]], %{{.+}}, 432, 0
+# CHECK: %[[#reg28:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 448, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg28]], %{{.+}}, 448, 0
+# CHECK: %[[#reg29:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 464, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg29]], %{{.+}}, 464, 0
+# CHECK: %[[#reg30:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 480, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg30]], %{{.+}}, 480, 0
+# CHECK: %[[#reg31:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 496, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg31]], %{{.+}}, 496, 0
+# CHECK: %[[#reg32:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 512, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg32]], %{{.+}}, 512, 0
+# CHECK: %[[#reg33:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 528, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg33]], %{{.+}}, 528, 0
+# CHECK: %[[#reg34:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 544, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg34]], %{{.+}}, 544, 0
+# CHECK: %[[#reg35:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 560, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg35]], %{{.+}}, 560, 0
+# CHECK: %[[#reg36:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 576, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg36]], %{{.+}}, 576, 0
+# CHECK: %[[#reg37:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 592, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg37]], %{{.+}}, 592, 0
+# CHECK: %[[#reg38:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 608, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg38]], %{{.+}}, 608, 0
+# CHECK: %[[#reg39:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 624, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg39]], %{{.+}}, 624, 0
+# CHECK: %[[#reg40:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 640, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg40]], %{{.+}}, 640, 0
+# CHECK: %[[#reg41:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 656, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg41]], %{{.+}}, 656, 0
+# CHECK: %[[#reg42:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 672, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg42]], %{{.+}}, 672, 0
+# CHECK: %[[#reg43:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 688, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg43]], %{{.+}}, 688, 0
+# CHECK: %[[#reg44:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 704, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg44]], %{{.+}}, 704, 0
+# CHECK: %[[#reg45:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 720, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg45]], %{{.+}}, 720, 0
+# CHECK: %[[#reg46:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 736, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg46]], %{{.+}}, 736, 0
+# CHECK: %[[#reg47:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 752, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg47]], %{{.+}}, 752, 0
+# CHECK: %[[#reg48:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 768, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg48]], %{{.+}}, 768, 0
+# CHECK: %[[#reg49:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 784, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg49]], %{{.+}}, 784, 0
+# CHECK: %[[#reg50:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 800, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg50]], %{{.+}}, 800, 0
+# CHECK: %[[#reg51:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 816, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg51]], %{{.+}}, 816, 0
+# CHECK: %[[#reg52:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 832, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg52]], %{{.+}}, 832, 0
+# CHECK: %[[#reg53:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 848, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg53]], %{{.+}}, 848, 0
+# CHECK: %[[#reg54:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 864, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg54]], %{{.+}}, 864, 0
+# CHECK: %[[#reg55:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 880, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg55]], %{{.+}}, 880, 0
+# CHECK: %[[#reg56:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 896, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg56]], %{{.+}}, 896, 0
+# CHECK: %[[#reg57:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 912, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg57]], %{{.+}}, 912, 0
+# CHECK: %[[#reg58:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 928, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg58]], %{{.+}}, 928, 0
+# CHECK: %[[#reg59:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 944, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg59]], %{{.+}}, 944, 0
+# CHECK: %[[#reg60:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 960, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg60]], %{{.+}}, 960, 0
+# CHECK: %[[#reg61:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 976, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg61]], %{{.+}}, 976, 0
+# CHECK: %[[#reg62:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 992, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg62]], %{{.+}}, 992, 0
+# CHECK: %[[#reg63:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 1008, 0
+# CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg63]], %{{.+}}, 1008, 0
+
+
+--- |
+  source_filename = ".\main.ll"
+  define amdgpu_ps void @main() #1 {
+    ret void
+  }
+  attributes #1 = { "target-cpu"="gfx1010" }
+  !llvm.ident = !{!0}
+  !0 = !{!"clang version 3.7 (tags/RELEASE_370/final)"}
+...
+---
+name:            main
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+  - { reg: '$sgpr2' }
+  - { reg: '$sgpr3' }
+  - { reg: '$sgpr4' }
+  - { reg: '$sgpr5' }
+  - { reg: '$sgpr6' }
+  - { reg: '$sgpr7' }
+  - { reg: '$sgpr8' }
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $vgpr0, $vgpr1
+
+    undef %0.sub0:sgpr_64 = COPY $sgpr0
+    undef %0.sub1:sgpr_64 = COPY $sgpr1
+
+    undef %1.sub0:sgpr_128 = COPY $sgpr4
+    undef %1.sub1:sgpr_128 = COPY $sgpr5
+    undef %1.sub2:sgpr_128 = COPY $sgpr6
+    undef %1.sub3:sgpr_128 = COPY $sgpr7
+
+    %3000:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0
+    %3001:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0
+    %3002:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 32, 0
+    %3003:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 48, 0
+    %3004:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 64, 0
+    %3005:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 80, 0
+    %3006:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 96, 0
+    %3007:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 112, 0
+    %3008:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 128, 0
+    %3009:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 144, 0
+    %30010:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 160, 0
+    %30011:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 176, 0
+    %30012:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 192, 0
+    %30013:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 208, 0
+    %30014:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 224, 0
+    %30015:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 240, 0
+    %30016:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 256, 0
+    %30017:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 272, 0
+    %30018:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 288, 0
+    %30019:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 304, 0
+    %30020:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 320, 0
+    %30021:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 336, 0
+    %30022:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 352, 0
+    %30023:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 368, 0
+    %30024:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 384, 0
+    %30025:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 400, 0
+    %30026:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 416, 0
+    %30027:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 432, 0
+    %30028:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 448, 0
+    %30029:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 464, 0
+    %30030:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 480, 0
+    %30031:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 496, 0
+    %30032:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 512, 0
+    %30033:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 528, 0
+    %30034:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 544, 0
+    %30035:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 560, 0
+    %30036:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 576, 0
+    %30037:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 592, 0
+    %30038:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 608, 0
+    %30039:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 624, 0
+    %30040:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 640, 0
+    %30041:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 656, 0
+    %30042:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 672, 0
+    %30043:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 688, 0
+    %30044:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 704, 0
+    %30045:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 720, 0
+    %30046:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 736, 0
+    %30047:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 752, 0
+    %30048:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 768, 0
+    %30049:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 784, 0
+    %30050:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 800, 0
+    %30051:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 816, 0
+    %30052:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 832, 0
+    %30053:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 848, 0
+    %30054:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 864, 0
+    %30055:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 880, 0
+    %30056:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 896, 0
+    %30057:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 912, 0
+    %30058:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 928, 0
+    %30059:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 944, 0
+    %30060:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 960, 0
+    %30061:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 976, 0
+    %30062:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 992, 0
+    %30063:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 1008, 0
+
+    %100:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %101:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %102:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %103:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %104:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %105:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %106:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %107:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %108:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %109:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1010:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1011:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1012:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1013:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1014:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1015:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1016:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1017:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1018:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1019:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1020:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1021:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1022:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1023:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1024:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1025:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1026:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1027:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1028:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1029:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1030:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1031:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1032:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1033:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1034:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1035:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1036:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1037:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1038:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1039:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1040:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1041:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1042:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1043:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1044:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1045:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1046:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1047:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1048:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1049:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1050:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1051:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1052:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1053:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1054:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1055:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1056:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1057:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1058:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1059:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1060:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1061:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1062:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1063:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+
+    %8000:vgpr_32 = IMPLICIT_DEF
+    %116:sreg_32_xm0 = nnan ninf nsz arcp contract afn reassoc V_CMP_GT_F32_e64 0, 0, 0, %8000, 0, implicit $exec, implicit $mode
+    $exec_lo = S_MOV_B32_term %116:sreg_32_xm0
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:  
+    successors: %bb.2
+    %8001:vgpr_32 = COPY %8000
+    S_BRANCH %bb.2
+
+  bb.2:
+
+    %3:vgpr_32 = IMPLICIT_DEF
+    S_BUFFER_STORE_DWORDX4_IMM killed %3000:sgpr_128, %1:sgpr_128, 0, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3001:sgpr_128, %1:sgpr_128, 16, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3002:sgpr_128, %1:sgpr_128, 32, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3003:sgpr_128, %1:sgpr_128, 48, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3004:sgpr_128, %1:sgpr_128, 64, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3005:sgpr_128, %1:sgpr_128, 80, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3006:sgpr_128, %1:sgpr_128, 96, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3007:sgpr_128, %1:sgpr_128, 112, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3008:sgpr_128, %1:sgpr_128, 128, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %3009:sgpr_128, %1:sgpr_128, 144, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30010:sgpr_128, %1:sgpr_128, 160, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30011:sgpr_128, %1:sgpr_128, 176, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30012:sgpr_128, %1:sgpr_128, 192, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30013:sgpr_128, %1:sgpr_128, 208, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30014:sgpr_128, %1:sgpr_128, 224, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30015:sgpr_128, %1:sgpr_128, 240, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30016:sgpr_128, %1:sgpr_128, 256, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30017:sgpr_128, %1:sgpr_128, 272, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30018:sgpr_128, %1:sgpr_128, 288, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30019:sgpr_128, %1:sgpr_128, 304, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30020:sgpr_128, %1:sgpr_128, 320, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30021:sgpr_128, %1:sgpr_128, 336, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30022:sgpr_128, %1:sgpr_128, 352, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30023:sgpr_128, %1:sgpr_128, 368, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30024:sgpr_128, %1:sgpr_128, 384, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30025:sgpr_128, %1:sgpr_128, 400, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30026:sgpr_128, %1:sgpr_128, 416, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30027:sgpr_128, %1:sgpr_128, 432, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30028:sgpr_128, %1:sgpr_128, 448, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30029:sgpr_128, %1:sgpr_128, 464, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30030:sgpr_128, %1:sgpr_128, 480, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30031:sgpr_128, %1:sgpr_128, 496, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30032:sgpr_128, %1:sgpr_128, 512, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30033:sgpr_128, %1:sgpr_128, 528, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30034:sgpr_128, %1:sgpr_128, 544, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30035:sgpr_128, %1:sgpr_128, 560, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30036:sgpr_128, %1:sgpr_128, 576, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30037:sgpr_128, %1:sgpr_128, 592, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30038:sgpr_128, %1:sgpr_128, 608, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30039:sgpr_128, %1:sgpr_128, 624, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30040:sgpr_128, %1:sgpr_128, 640, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30041:sgpr_128, %1:sgpr_128, 656, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30042:sgpr_128, %1:sgpr_128, 672, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30043:sgpr_128, %1:sgpr_128, 688, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30044:sgpr_128, %1:sgpr_128, 704, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30045:sgpr_128, %1:sgpr_128, 720, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30046:sgpr_128, %1:sgpr_128, 736, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30047:sgpr_128, %1:sgpr_128, 752, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30048:sgpr_128, %1:sgpr_128, 768, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30049:sgpr_128, %1:sgpr_128, 784, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30050:sgpr_128, %1:sgpr_128, 800, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30051:sgpr_128, %1:sgpr_128, 816, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30052:sgpr_128, %1:sgpr_128, 832, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30053:sgpr_128, %1:sgpr_128, 848, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30054:sgpr_128, %1:sgpr_128, 864, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30055:sgpr_128, %1:sgpr_128, 880, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30056:sgpr_128, %1:sgpr_128, 896, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30057:sgpr_128, %1:sgpr_128, 912, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30058:sgpr_128, %1:sgpr_128, 928, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30059:sgpr_128, %1:sgpr_128, 944, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30060:sgpr_128, %1:sgpr_128, 960, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30061:sgpr_128, %1:sgpr_128, 976, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30062:sgpr_128, %1:sgpr_128, 992, 0
+    S_BUFFER_STORE_DWORDX4_IMM killed %30063:sgpr_128, %1:sgpr_128, 1008, 0
+
+    EXP 0, killed %100, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %101, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %102, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %103, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %104, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %105, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %106, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %107, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %108, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %109, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1010, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1011, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1012, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1013, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1014, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1015, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1016, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1017, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1018, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1019, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1020, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1021, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1022, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1023, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1024, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1025, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1026, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1027, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1028, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1029, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1030, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1031, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1032, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1033, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1034, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1035, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1036, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1037, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1038, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1039, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1040, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1041, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1042, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1043, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1044, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1045, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1046, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1047, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1048, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1049, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1050, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1051, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1052, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1053, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1054, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1055, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1056, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1057, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1058, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1059, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1060, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1061, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1062, %3, %3, %3, -1, -1, 15, implicit $exec
+    EXP 0, killed %1063, %3, %3, %3, -1, -1, 15, implicit $exec
+
+
+    S_ENDPGM 0
+...

>From a13cfc4dcc49c810182bf5ca2bd3b3f0a40c75cd Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Thu, 6 Feb 2025 14:09:32 -0800
Subject: [PATCH 3/3] Test renames, only keeping the required flags for the
 tests

---
 .../remat/{group_remat_with_uses.mir => group_remat_clone.mir}  | 2 +-
 .../AMDGPU/remat/{group_remat.mir => group_remat_move.mir}      | 0
 llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir                  | 2 +-
 llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir             | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename llvm/test/CodeGen/AMDGPU/remat/{group_remat_with_uses.mir => group_remat_clone.mir} (99%)
 rename llvm/test/CodeGen/AMDGPU/remat/{group_remat.mir => group_remat_move.mir} (100%)

diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
rename to llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir
index 637a683bdd041d4..c99a1835454fd1c 100644
--- a/llvm/test/CodeGen/AMDGPU/remat/group_remat_with_uses.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat/group_remat_clone.mir
@@ -638,4 +638,4 @@ body:             |
 
     S_ENDPGM 0
 ...
-    
\ No newline at end of file
+    
diff --git a/llvm/test/CodeGen/AMDGPU/remat/group_remat.mir b/llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/remat/group_remat.mir
rename to llvm/test/CodeGen/AMDGPU/remat/group_remat_move.mir
diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
index bc2c97f91f46c67..528515d235c8b60 100644
--- a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat | FileCheck %s
 
-# Check that the loads have been moved to the use
+# Check that the scalar loads have been moved to the use
 # CHECK: bb.2:
 # CHECK: %[[#reg0:]]:sgpr_128 = S_LOAD_DWORDX4_IMM %{{.+}}, 0, 0
 # CHECK: S_BUFFER_STORE_DWORDX4_IMM %[[#reg0]], %{{.+}}, 0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
index e8a66b47ac732b5..53f59cc3f8b0b09 100644
--- a/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat/vector_to_scalar.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-hot-block-remat-aggressive -amdgpu-remat-enable-sub-exp-remat-aggressive -amdgpu-remat-enable-sub-exp-remat | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-late-float-vtos -amdgpu-remat-enable-sub-exp-remat | FileCheck %s
 
 # DEFS
 # CHECK: %[[#div00:]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec


