[llvm] [AMDGPU] Added hot-block-rematerialize pass (PR #126331)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 7 17:31:06 PST 2025
github-actions[bot] wrote:
:warning: The C/C++ code formatter, clang-format, found issues in your code. :warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
git-clang-format --diff 2eb44aa0a94a8d4230c1c9a0c306af16bfc92925 a13cfc4dcc49c810182bf5ca2bd3b3f0a40c75cd --extensions cpp,h -- llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h llvm/include/llvm/CodeGen/TargetRegisterInfo.h llvm/lib/CodeGen/TargetRegisterInfo.cpp llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Target/AMDGPU/GCNRegPressure.h llvm/lib/Target/AMDGPU/SIInstrInfo.h
``````````
</details>
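If the suggested fixes look right, one way to apply them locally is to save the diff below to a file and apply it with `git apply` — a sketch, not part of the bot's standard instructions, assuming the patch is saved as `clang-format.patch` (a hypothetical name) at the repository root:

``````````bash
# Assumption: the diff from the <details> section below has been saved
# verbatim as clang-format.patch at the repository root.
git apply clang-format.patch

# Inspect what changed before amending or committing.
git diff --stat
``````````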
<details>
<summary>
View the diff from clang-format here.
</summary>
``````````diff
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index d37796a828..d21fb9e0dd 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -796,9 +796,10 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask(
}
if (BestIdx == 0) {
- LLVM_DEBUG(dbgs() << "Unable to find minimal spanning sub register(s) for "
- << getRegClassName(RC) << " mask " << PrintLaneMask(mask)
- << '\n');
+ LLVM_DEBUG(
+ dbgs() << "Unable to find minimal spanning sub register(s) for "
+ << getRegClassName(RC) << " mask " << PrintLaneMask(mask)
+ << '\n');
assert(false && "Impossible to span reg class");
return std::vector<unsigned>();
}
@@ -809,4 +810,3 @@ TargetRegisterInfo::getMinimalSpanningSubRegIdxSetForLaneMask(
return result;
}
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
index 8647185bf5..afb4e0bafa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp
@@ -1,4 +1,5 @@
-//===-- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block Rematerialize-------===//
+//===-- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot Block
+//Rematerialize-------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,24 +14,24 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "AMDGPUMIRUtils.h"
#include "AMDGPUMirDivergenceAnalysis.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
#include "AMDGPUSubExpDag.h"
+#include "AMDGPUSubtarget.h"
#include "AMDGPUVMemDegreeDAG.h"
-#include "AMDGPUOccupancyAndLatencyHelper.h"
#include "GCNRegPressure.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "AMDGPUMIRUtils.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
@@ -40,20 +41,24 @@
using namespace llvm;
static cl::opt<unsigned> TargetOccupancy("amdgpu-remat-target-occupancy");
-static cl::opt<bool> EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
-static cl::opt<bool> EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive");
-static cl::opt<bool> EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone");
+static cl::opt<bool>
+ EnableAggressive("amdgpu-remat-enable-hot-block-remat-aggressive");
+static cl::opt<bool>
+ EnableSubExpAggressive("amdgpu-remat-enable-sub-exp-remat-aggressive");
+static cl::opt<bool>
+ EnableSubExpClone("amdgpu-remat-enable-sub-exp-remat-clone");
static cl::opt<bool> EnableVmemDegree("amdgpu-remat-enable-vmem-degree");
static cl::opt<bool> EnableInBlockRemat("amdgpu-remat-enable-in-blk-remat");
static cl::opt<bool> EnableSubExp("amdgpu-remat-enable-sub-exp-remat");
-static cl::opt<bool> EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos");
-static cl::opt<bool> EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg");
+static cl::opt<bool>
+ EnableUniformVectorToScalar("amdgpu-remat-enable-late-float-vtos");
+static cl::opt<bool>
+ EnableSubExpMinReg("amdgpu-remat-enable-sub-exp-remat-min-reg");
namespace {
typedef DenseSet<MachineInstr *> InstSet;
typedef DenseSet<MachineBasicBlock *> BlockSet;
-template<typename T>
-using BlockMap = MapVector<MachineBasicBlock *, T>;
+template <typename T> using BlockMap = MapVector<MachineBasicBlock *, T>;
// Rematerialize in a single pass instead of doing in register allcation.
// If in register allocation, fail to rematerialize will cause spill.
@@ -62,9 +67,9 @@ class AMDGPUHotBlockRematerialize : public MachineFunctionPass {
public:
static char ID;
- DenseSet<const MachineInstr*> TotalUniformInsts;
- DenseSet<const MachineInstr*> SafeToRemoveInsts;
- DenseSet<const MachineInstr*> DivergentInsts;
+ DenseSet<const MachineInstr *> TotalUniformInsts;
+ DenseSet<const MachineInstr *> SafeToRemoveInsts;
+ DenseSet<const MachineInstr *> DivergentInsts;
void RemoveInst(const MachineInstr *MI) {
TotalUniformInsts.erase(MI);
SafeToRemoveInsts.erase(MI);
@@ -96,9 +101,8 @@ typedef AMDGPUHotBlockRematerialize Remat;
// Util functions.
namespace {
-MachineBasicBlock *
-nearest_common_dominator(MachineDominatorTree *DT,
- BlockSet &Blocks) {
+MachineBasicBlock *nearest_common_dominator(MachineDominatorTree *DT,
+ BlockSet &Blocks) {
auto I = Blocks.begin(), E = Blocks.end();
MachineBasicBlock *DomB = cast<MachineBasicBlock>(*(I++));
@@ -214,10 +218,10 @@ bool IsSafeToMove(MachineInstr *DefMI, MachineRegisterInfo &MRI) {
return true;
}
-
// SGPR has alignment requirment, cannot get accurate reg number.
const unsigned NearTargetRegLimit = 10;
-bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST, MachineFunction &MF) {
+bool nearSgprSpill(unsigned maxSPressure, const GCNSubtarget *ST,
+ MachineFunction &MF) {
unsigned maxSGPR = ST->getAddressableNumSGPRs();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
@@ -251,10 +255,10 @@ struct RematStatus {
DenseSet<MachineBasicBlock *> MemWriteMBBSet;
};
-unsigned CollectMBBPressure(
- MachineBasicBlock &MBB, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
- const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
- RematStatus &status) {
+unsigned CollectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST, unsigned &maxVPressure,
+ unsigned &maxSPressure, RematStatus &status) {
// Skip processing current block if it has only debug instructions
if (MBB.getFirstNonDebugInstr() == MBB.end())
return ST->getOccupancyWithNumVGPRs(0);
@@ -287,10 +291,10 @@ unsigned CollectMBBPressure(
return RP.getOccupancy(*ST);
}
-unsigned CollectFnPressure(
- MachineFunction &MF, LiveIntervals *LIS, const MachineRegisterInfo &MRI,
- const GCNSubtarget *ST, unsigned &maxVPressure, unsigned &maxSPressure,
- RematStatus &status) {
+unsigned CollectFnPressure(MachineFunction &MF, LiveIntervals *LIS,
+ const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST, unsigned &maxVPressure,
+ unsigned &maxSPressure, RematStatus &status) {
unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second;
// If only have one block, input/ouput virtual live set are empty.
if (MF.size() > 1) {
@@ -349,15 +353,13 @@ unsigned CollectFnPressure(
LLVM_DEBUG(
const SIRegisterInfo *SIRI = ST->getRegisterInfo();
- dbgs() << "output live"; for (auto &it
- : status.MBBOutputLiveMap) {
+ dbgs() << "output live"; for (auto &it : status.MBBOutputLiveMap) {
unsigned Idx = it.first->getNumber();
auto LiveReg = it.second;
dbgs() << "MBB" << Idx << ":";
llvm::dumpLiveSet(LiveReg, SIRI);
} dbgs() << "input live";
- for (auto &it
- : status.MBBInputLiveMap) {
+ for (auto &it : status.MBBInputLiveMap) {
unsigned Idx = it.first->getNumber();
auto LiveReg = it.second;
dbgs() << "MBB" << Idx << ":";
@@ -373,14 +375,14 @@ unsigned CollectFnPressure(
}
return TgtOcc;
}
-RematStatus
-GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
- const MachineRegisterInfo &MRI, const GCNSubtarget *ST) {
+RematStatus GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const GCNSubtarget *ST) {
unsigned maxSPressure = 0;
unsigned maxVPressure = 0;
RematStatus status;
- unsigned TgtOcc = CollectFnPressure(MF, LIS, MRI, ST, maxVPressure,
- maxSPressure, status);
+ unsigned TgtOcc =
+ CollectFnPressure(MF, LIS, MRI, ST, maxVPressure, maxSPressure, status);
const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second;
if (TgtOcc >= MaxOcc) {
status.TargetOcc = TgtOcc;
@@ -415,7 +417,7 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
TgtOcc = bigOcc;
bNotBalance = true;
if (TgtOcc >= MaxOccupancy)
- TgtOcc = MaxOccupancy-1;
+ TgtOcc = MaxOccupancy - 1;
}
}
@@ -433,7 +435,7 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
vInputPressure += RegSize;
} else {
unsigned RegIndex = SIRI->getHWRegIndex(Reg);
- uint64_t mask = ((1 << RegSize) - 1 ) << RegIndex;
+ uint64_t mask = ((1 << RegSize) - 1) << RegIndex;
sInputMask |= mask;
}
}
@@ -448,7 +450,6 @@ GetRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
mask = mask << 4;
}
-
// If balanced, try next occupancy.
TgtOcc = bNotBalance ? TgtOcc : (TgtOcc + 1);
@@ -611,8 +612,7 @@ int GetSharedReducedSize(InstSet &ReducedInsts, bool bVGPR,
}
int GetReducedSize(MapVector<unsigned, RematNode> &RematMap, bool bVGPR,
- GCNRPTracker::LiveRegSet &CanidateSet,
- InstSet &ReducedInsts,
+ GCNRPTracker::LiveRegSet &CanidateSet, InstSet &ReducedInsts,
const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
BlockLiveInfo &LiveInfo,
DenseMap<MachineBasicBlock *, unsigned> &RPOTIndexMap) {
@@ -788,9 +788,11 @@ void BuildRematCandiates(std::vector<RematNode> &Candidates,
}
// For case like
-// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, implicit-def dead $scc; xb.uniform
-// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; xb.uniform
-// %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit killed $scc; xb.uniform
+// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0,
+// implicit-def dead $scc; xb.uniform
+// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc;
+// xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit
+// killed $scc; xb.uniform
// Sink S_AND right before S_CSELECT will overwrite SCC.
// To avoid it, skip case when DefMI and UseMI has implicit define use.
bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) {
@@ -970,7 +972,7 @@ int FilterRematCandiates(std::vector<RematNode> &Candidates,
}
void updateUsers(unsigned Reg, unsigned NewReg, bool bSubRegDef,
- SmallVector<MachineInstr *, 2> &userMIs) {
+ SmallVector<MachineInstr *, 2> &userMIs) {
for (MachineInstr *UseMI : userMIs) {
for (MachineOperand &MO : UseMI->operands()) {
if (!MO.isReg())
@@ -996,7 +998,6 @@ DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
}
}
-
// For userBlocks which dominate all hotBlocks, don't need to clone because
// the value not cross hotBlocks when later blocks are cloned.
// For userBlocks which dominated by all hotBlocks, they could share clones
@@ -1061,68 +1062,45 @@ DenseMap<MachineBasicBlock *, BlockSet> reduceClonedMBBs(
// Look for an earlier insert point if the InstructionToMove
// writes to scc and scc is live at the CurrentInsertPoint.
static MachineBasicBlock::iterator AdjustInsertPointToAvoidSccSmash(
- MachineInstr *InstructionToMove,
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator CurrentInsertPoint,
- MachineRegisterInfo &MRI,
- const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII
-)
-{
- const bool WillSmashScc = InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
- if (WillSmashScc)
- {
- CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB,
- CurrentInsertPoint,
- SIRI,
- SIII,
- &MRI
- );
- }
-
- return CurrentInsertPoint;
+ MachineInstr *InstructionToMove, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ const bool WillSmashScc =
+ InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI);
+ if (WillSmashScc) {
+ CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(
+ MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+ }
+
+ return CurrentInsertPoint;
}
// Look for an earlier insert point if the SubExp
// writes to scc and scc is live at the CurrentInsertPoint.
static MachineBasicBlock::iterator AdjustInsertPointForSubExpToAvoidSccSmash(
- const SubExp &SubExpToMove,
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator CurrentInsertPoint,
- MachineRegisterInfo& MRI,
- const SIRegisterInfo* SIRI,
- const SIInstrInfo* SIII
-)
-{
- const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI);
- if (WillSmashScc)
- {
- CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(MBB,
- CurrentInsertPoint,
- SIRI,
- SIII,
- &MRI
- );
- }
-
- return CurrentInsertPoint;
+ const SubExp &SubExpToMove, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ const bool WillSmashScc = SubExpToMove.modifiesRegister(AMDGPU::SCC, SIRI);
+ if (WillSmashScc) {
+ CurrentInsertPoint = llvm::FindOrCreateInsertionPointForSccDef(
+ MBB, CurrentInsertPoint, SIRI, SIII, &MRI);
+ }
+
+ return CurrentInsertPoint;
}
// Return trun if moving MI to Location will smash a live scc value.
-static bool WillSmashSccAtLocation(
- MachineInstr* MI,
- MachineBasicBlock* MBB,
- MachineBasicBlock::iterator Location
-)
-{
- // It is ok to pass nullptr to `modifiesRegister` for TRI here since
- // SCC has no subreg/suprereg relationships.
- return MI->modifiesRegister(AMDGPU::SCC, nullptr)
- && llvm::IsSccLiveAt(MBB, Location);
+static bool WillSmashSccAtLocation(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator Location) {
+ // It is ok to pass nullptr to `modifiesRegister` for TRI here since
+ // SCC has no subreg/suprereg relationships.
+ return MI->modifiesRegister(AMDGPU::SCC, nullptr) &&
+ llvm::IsSccLiveAt(MBB, Location);
}
-void ApplyCloneRemat(Remat *Remat,
- RematNode &Node, std::vector<BlockLiveInfo> &hotBlocks,
+void ApplyCloneRemat(Remat *Remat, RematNode &Node,
+ std::vector<BlockLiveInfo> &hotBlocks,
MachineDominatorTree *pDT, MachineRegisterInfo &MRI,
SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI,
const SIInstrInfo *SIII, MachineFunction &MF) {
@@ -1182,10 +1160,9 @@ void ApplyCloneRemat(Remat *Remat,
InsertPointMI = UseMI;
}
}
-
+
MachineBasicBlock::iterator InsertPoint = AdjustInsertPointToAvoidSccSmash(
- DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII
- );
+ DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII);
for (MachineMemOperand *MO : DefMI->memoperands()) {
NewDef->addMemOperand(MF, MO);
@@ -1218,10 +1195,11 @@ void ApplyCloneRemat(Remat *Remat,
void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
SlotIndexes *slotIndexes,
- const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
+ const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII) {
MachineInstr *DefMI = Node.DefMI;
MachineInstr *InsertPointMI = Node.InsertPointMI;
- MachineBasicBlock* MBB = nullptr;
+ MachineBasicBlock *MBB = nullptr;
// Find a valid insert point.
MachineBasicBlock::iterator InsertPoint;
@@ -1233,10 +1211,9 @@ void ApplyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI,
MBB = Node.InsertBlock;
}
- InsertPoint = AdjustInsertPointToAvoidSccSmash(
- DefMI, MBB, InsertPoint, MRI, SIRI, SIII
- );
-
+ InsertPoint = AdjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI,
+ SIRI, SIII);
+
// Move instruction to new location.
DefMI->removeFromParent();
InsertPoint->getParent()->insert(InsertPoint, DefMI);
@@ -1268,7 +1245,8 @@ void ApplyRemat(Remat *Remat, MapVector<unsigned, RematNode> &RematMap,
if (Node.Kind == RematNode::RematKind::OneDefOneUse) {
ApplyOneDefOneUseRemat(Node, MRI, slotIndexes, SIRI, SIII);
} else if (Node.Kind == RematNode::RematKind::Clone) {
- ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII, MF);
+ ApplyCloneRemat(Remat, Node, hotBlocks, pDT, MRI, slotIndexes, SIRI, SIII,
+ MF);
}
}
}
@@ -1502,7 +1480,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg);
if (UseMI.getParent() != MBB)
continue;
- int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI, /*bVGPR*/false);
+ int gain = RematGain(&MI, Reg, CandidateRegs, MRI, SIRI,
+ /*bVGPR*/ false);
if (gain > 0) {
// Skip case when DefMI has implicit define which used by UseMI.
if (isImplicitDefUse(&MI, &UseMI)) {
@@ -1536,8 +1515,7 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
bool bNeedVRemat = rematVCnt > 0;
// If sgpr spill, always do remat.
bool bSRematOK =
- (newRematSCnt <= 0 && !SRematMap.empty()) ||
- bForceRematSgpr;
+ (newRematSCnt <= 0 && !SRematMap.empty()) || bForceRematSgpr;
bool bVRematOK =
(status.bNotBalance || newRematVCnt <= 0) && !VRematMap.empty();
if (bNeedSRemat && bNeedVRemat) {
@@ -1572,7 +1550,8 @@ bool hotBlockRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
if (!SRematMap.empty()) {
bUpdated = true;
- ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII, MF);
+ ApplyRemat(Remat, SRematMap, hotBlocks, pDT, SlotIndexes, MRI, SIRI, SIII,
+ MF);
LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs()););
}
@@ -1592,49 +1571,46 @@ bool isPhyRegUniqueDef(unsigned Reg, const MachineRegisterInfo &MRI) {
return DefMIs.size() == 1;
}
-static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg)
-{
- if (!MO.isImplicit() || !MO.isUse() || !MO.isReg())
- {
- return false;
- }
+static bool IsImplicitUseOfReg(const MachineOperand &MO, unsigned Reg) {
+ if (!MO.isImplicit() || !MO.isUse() || !MO.isReg()) {
+ return false;
+ }
- return MO.getReg() == Reg;
+ return MO.getReg() == Reg;
}
-static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg)
-{
- if (!MO.isImplicit() || !MO.isDef() || !MO.isReg())
- {
- return false;
- }
+static bool IsImplicitDefOfReg(const MachineOperand &MO, unsigned Reg) {
+ if (!MO.isImplicit() || !MO.isDef() || !MO.isReg()) {
+ return false;
+ }
- return MO.getReg() == Reg;
+ return MO.getReg() == Reg;
}
-static bool IsSafeRematCandidateUser(const MachineInstr *UseMI, const SIInstrInfo *SIII)
-{
- // Make sure UseMI is not wqm like sample.
- if (SIII->isWQM(UseMI->getOpcode()))
- return false;
- if (UseMI->getOpcode() == AMDGPU::PHI)
- return false;
-
- return true;
+static bool IsSafeRematCandidateUser(const MachineInstr *UseMI,
+ const SIInstrInfo *SIII) {
+ // Make sure UseMI is not wqm like sample.
+ if (SIII->isWQM(UseMI->getOpcode()))
+ return false;
+ if (UseMI->getOpcode() == AMDGPU::PHI)
+ return false;
+
+ return true;
}
static bool isConvergent(Remat *Remat, const MachineInstr &MI) {
return MI.isConvergent() &&
- // This flag is set on readfirstlane's to indicate that they
- // are redundant (the value being read is already uniform).
- // Normally, readfirstlanes are convergent, because different exec
- // will cause a different value to be read; a known uniform
- // readfirstlane is safe to move or clone and not actually convergent.
- !Remat->TotalUniformInsts.count(&MI);
+ // This flag is set on readfirstlane's to indicate that they
+ // are redundant (the value being read is already uniform).
+ // Normally, readfirstlanes are convergent, because different exec
+ // will cause a different value to be read; a known uniform
+ // readfirstlane is safe to move or clone and not actually convergent.
+ !Remat->TotalUniformInsts.count(&MI);
}
bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
- const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, bool bSink) {
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ bool bSink) {
if (Reg.isPhysical())
return false;
bool bVGPR = SIRI->isVGPR(MRI, Reg);
@@ -1661,7 +1637,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
if (!Op.isReg())
continue;
Register OpReg = Op.getReg();
- if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) || IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO))
+ if (IsImplicitUseOfReg(Op, AMDGPU::EXEC) ||
+ IsImplicitUseOfReg(Op, AMDGPU::EXEC_LO))
continue;
if (IsImplicitUseOfReg(Op, AMDGPU::MODE))
continue;
@@ -1672,7 +1649,8 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
continue;
if (OpReg.isPhysical())
return false;
- if (!MRI.getUniqueVRegDef(OpReg) && !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) {
+ if (!MRI.getUniqueVRegDef(OpReg) &&
+ !llvm::IsSub0Sub1SingleDef(OpReg, MRI)) {
return false;
}
}
@@ -1693,12 +1671,10 @@ bool isSafeCandidate(Remat *Remat, Register Reg, const MachineRegisterInfo &MRI,
}
std::vector<SubExp> buildSubExpFromCandidates(
- Remat *Remat,
- GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
+ Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes,
- GCNRPTracker::LiveRegSet &unUsedPassThrus,
- bool bAllowPartialUseInSubExp) {
+ GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) {
InstSet CandidateDefs;
DenseSet<unsigned> RemovedCandidates;
std::vector<unsigned> CandidateRegs;
@@ -1795,7 +1771,7 @@ std::vector<SubExp> buildSubExpFromCandidates(
break;
}
- if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/true))
+ if (!isSafeCandidate(Remat, Reg, MRI, SIRI, SIII, /*bSink*/ true))
continue;
// If all users of MI are in candidate defs, add MI into candidate defs.
@@ -1852,10 +1828,9 @@ std::vector<SubExp> buildSubExpFromCandidates(
defs.emplace_back(pMI);
}
- LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI
- : defs) {
- MI->dump();
- } dbgs() << "\nFinished Candidate Defs End\n";);
+ LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n";
+ for (MachineInstr *MI : defs) { MI->dump(); } dbgs()
+ << "\nFinished Candidate Defs End\n";);
// Build SubExp with CandidateDefs as Nodes, CandidateInput as input
// Candidates as output.
@@ -1874,10 +1849,8 @@ std::vector<SubExp> buildSubExpFromCandidates(
return dag.SubExps;
}
-
std::vector<SubExp> buildSubExpFromCandidatesTopBottom(
- Remat* Remat,
- GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
+ Remat *Remat, GCNRPTracker::LiveRegSet &Candidates, MachineBasicBlock *MBB,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
const MachineRegisterInfo &MRI, SlotIndexes *slotIndexes) {
InstSet CandidateDefs;
@@ -2043,13 +2016,11 @@ std::vector<SubExp> buildSubExpFromCandidatesTopBottom(
defs.emplace_back(pMI);
}
- LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n"; for (MachineInstr *MI
- : defs) {
- MI->dump();
- } dbgs() << "\nFinished Candidate Defs End\n";);
+ LLVM_DEBUG(dbgs() << "\nFinished Candidate Defs:\n";
+ for (MachineInstr *MI : defs) { MI->dump(); } dbgs()
+ << "\nFinished Candidate Defs End\n";);
- LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it
- : LocalCandidates) {
+ LLVM_DEBUG(dbgs() << "\nLocalCandidates:\n"; for (auto it : LocalCandidates) {
pressure::print_reg(it.first, MRI, SIRI, llvm::dbgs());
} dbgs() << "\nLocalCandidates End\n";);
// Make sure all input reg are uniqueDef.
@@ -2061,7 +2032,6 @@ std::vector<SubExp> buildSubExpFromCandidatesTopBottom(
return dag.SubExps;
}
-
void print_vreg(Register Reg, const MachineRegisterInfo &MRI) {
if (Reg.isVirtual()) {
StringRef Name = MRI.getVRegName(Reg);
@@ -2099,8 +2069,7 @@ MachineBasicBlock *FindTargetBlock(unsigned Reg, MachineBasicBlock *FromBB,
void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI,
MachineDominatorTree *pDT,
- SlotIndexes *slotIndexes,
- const SIInstrInfo *SIII,
+ SlotIndexes *slotIndexes, const SIInstrInfo *SIII,
const SIRegisterInfo *SIRI) {
// Move from bottom.
MachineBasicBlock *FromBB = Exp.FromBB;
@@ -2115,12 +2084,14 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI,
continue;
// Do not overwrite a live scc.
- MachineBasicBlock::iterator InsertPoint = ToBB->SkipPHIsAndLabels(ToBB->begin());
+ MachineBasicBlock::iterator InsertPoint =
+ ToBB->SkipPHIsAndLabels(ToBB->begin());
if (WillSmashSccAtLocation(DefMI, ToBB, InsertPoint))
continue;
DefMI->removeFromParent();
- assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) && "invalid insert point");
+ assert(!llvm::isExecUpdateForControlFlow(*InsertPoint) &&
+ "invalid insert point");
ToBB->insert(InsertPoint, DefMI);
// Debug insts don't need slot index.
if (DefMI->isDebugInstr())
@@ -2131,12 +2102,11 @@ void ApplySubExpMoveNearUser(SubExp &Exp, const MachineRegisterInfo &MRI,
}
}
-
void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI,
- MachineDominatorTree *pDT,
- SlotIndexes *slotIndexes,
- const SIInstrInfo *SIII,
- const SIRegisterInfo *SIRI) {
+ MachineDominatorTree *pDT,
+ SlotIndexes *slotIndexes,
+ const SIInstrInfo *SIII,
+ const SIRegisterInfo *SIRI) {
// Move from top.
// Find lowest input def.
MachineBasicBlock *ToBB = Exp.ToBB;
@@ -2152,9 +2122,8 @@ void ApplySubExpMoveNearDefine(SubExp &Exp, MachineRegisterInfo &MRI,
Terminator = ToBB->end();
}
- Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(
- Exp, ToBB, Terminator, MRI, SIRI, SIII
- );
+ Terminator = AdjustInsertPointForSubExpToAvoidSccSmash(Exp, ToBB, Terminator,
+ MRI, SIRI, SIII);
for (auto it = Exp.SUnits.begin(); it != Exp.SUnits.end(); it++) {
MachineInstr *DefMI = *it;
@@ -2388,11 +2357,12 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector<HotBlock> &hotBlocks,
reduceClonedMBBs(Exp, userBlocks, userBlocksLiveRegs, hotBlocks, pDT);
// Sort to make stable order.
- std::sort(userBlocks.begin(), userBlocks.end(),
- [](std::pair<MachineBasicBlock*, SmallVector<MachineInstr*, 2>>& it0,
- std::pair<MachineBasicBlock*, SmallVector<MachineInstr*, 2>>& it1) {
+ std::sort(
+ userBlocks.begin(), userBlocks.end(),
+ [](std::pair<MachineBasicBlock *, SmallVector<MachineInstr *, 2>> &it0,
+ std::pair<MachineBasicBlock *, SmallVector<MachineInstr *, 2>> &it1) {
return it0.first->getNumber() < it1.first->getNumber();
- });
+ });
const bool bModifiesScc = Exp.modifiesRegister(AMDGPU::SCC, SIRI);
@@ -2481,7 +2451,6 @@ void ApplySubExpCloneNearUser(SubExp &Exp, std::vector<HotBlock> &hotBlocks,
}
}
-
void ApplySubExpCloneNearUserInBlock(
SubExp &Exp,
DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
@@ -2620,7 +2589,7 @@ unsigned getPacifistLevel(unsigned Reg,
}
bool hasInBlockDef(unsigned Reg, MachineBasicBlock *MBB,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI) {
for (MachineInstr &def : MRI.def_instructions(Reg)) {
if (def.getParent() != MBB)
continue;
@@ -2655,8 +2624,8 @@ bool isPassThru(unsigned Reg, const GCNRPTracker::LiveRegSet &inputLive,
return inputLive.count(Reg) && outputLive.count(Reg);
}
-// Instructions which only use imm/passThru reg/output only reg will not kill any
-// live reg, so name them pacifist here.
+// Instructions which only use imm/passThru reg/output only reg will not kill
+// any live reg, so name them pacifist here.
bool collectPacifist(MachineInstr &MI,
const GCNRPTracker::LiveRegSet &inputLive,
const GCNRPTracker::LiveRegSet &outputLive,
@@ -2699,7 +2668,8 @@ bool collectPacifist(MachineInstr &MI,
if (Reg.isPhysical())
return false;
- if (nullptr == getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
+ if (nullptr ==
+ getInBlockUniqueDef(Reg, MI.getParent(), inputLive, outputLive, MRI))
return false;
bHasDef = true;
@@ -2708,30 +2678,27 @@ bool collectPacifist(MachineInstr &MI,
return bHasDef;
}
-static MachineInstr* findFirstAliasingLoadOrStoreInMBB(
- MachineInstr &MI,
- MachineBasicBlock &MBB,
- AliasAnalysis *AA
-)
-{
- if (MI.mayLoadOrStore())
- {
- for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end(); I != E; ++I)
- {
- const bool UseTBAA = false;
- if (MI.mayAlias(AA, *I, UseTBAA))
- {
- return &*I;
- }
- }
+static MachineInstr *findFirstAliasingLoadOrStoreInMBB(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ AliasAnalysis *AA) {
+ if (MI.mayLoadOrStore()) {
+ for (MachineBasicBlock::iterator I = MI.getIterator(), E = MBB.end();
+ I != E; ++I) {
+ const bool UseTBAA = false;
+ if (MI.mayAlias(AA, *I, UseTBAA)) {
+ return &*I;
+ }
}
+ }
- return nullptr;
+ return nullptr;
}
-static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock &MBB, MachineRegisterInfo &MRI,
- AliasAnalysis *AA,
- SlotIndexes *slotIndexes) {
+static MachineInstr *findPacifistInsertPoint(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineRegisterInfo &MRI,
+ AliasAnalysis *AA,
+ SlotIndexes *slotIndexes) {
SmallVector<MachineInstr *, 2> users;
@@ -2739,14 +2706,13 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock
// op with which it aliases. Find the first instruction
// that aliases the pacifist MI (if any) and add it to the list
// of users. The sort() below will select the earliest user instruction.
- if (MachineInstr* AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) {
+ if (MachineInstr *AliasMI = findFirstAliasingLoadOrStoreInMBB(MI, MBB, AA)) {
users.push_back(AliasMI);
}
for (MachineOperand &MO : MI.defs()) {
unsigned Reg = MO.getReg();
- for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg))
- {
+ for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
if (&MBB != UseMI.getParent())
continue;
users.emplace_back(&UseMI);
@@ -2770,8 +2736,7 @@ static MachineInstr *findPacifistInsertPoint(MachineInstr &MI, MachineBasicBlock
bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS,
MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
const SIInstrInfo *SIII, AliasAnalysis *AA,
- RematStatus &status)
-{
+ RematStatus &status) {
const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB];
const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
@@ -2792,7 +2757,8 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS,
// Move pacifist to its first user.
for (MachineInstr *MI : pacifistList) {
- MachineInstr *firstUser = findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes);
+ MachineInstr *firstUser =
+ findPacifistInsertPoint(*MI, MBB, MRI, AA, slotIndexes);
if (firstUser == MI)
continue;
if (firstUser == MI->getNextNode())
@@ -2809,14 +2775,15 @@ bool tryHoldPacifist(MachineBasicBlock &MBB, LiveIntervals *LIS,
// BRANCH may have exec update before it.
insertPoint--;
- insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
+ insertPoint =
+ llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
while ((insertPoint->definesRegister(AMDGPU::EXEC, SIRI) ||
insertPoint->definesRegister(AMDGPU::EXEC_LO, SIRI)) &&
- insertPoint != MI->getIterator())
- {
+ insertPoint != MI->getIterator()) {
insertPoint--;
- insertPoint = llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
+ insertPoint =
+ llvm::skipDebugInstructionsBackward(insertPoint, MBB.instr_begin());
}
if (insertPoint == MI->getIterator())
continue;
@@ -2882,7 +2849,7 @@ bool collectVToSCrossHotSpot(
const SIInstrInfo *SIII) {
unsigned VLimit = status.TargetVLimit;
unsigned SLimit = status.TargetSLimit;
- auto& ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
GCNDownwardRPTracker Tracker(*LIS);
@@ -2921,24 +2888,23 @@ bool collectVToSCrossHotSpot(
VExtra--;
bUpdated = true;
}
-
}
return bUpdated;
}
// Return true if the user is outside of the def's loop.
-static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User, MachineLoopInfo *MLI)
-{
- MachineLoop* L = MLI->getLoopFor(Def->getParent());
+static bool IsCrossLoopUse(MachineInstr *Def, MachineInstr *User,
+ MachineLoopInfo *MLI) {
+ MachineLoop *L = MLI->getLoopFor(Def->getParent());
return L && !L->contains(User->getParent());
}
bool rematUniformVgprToSgpr(
- Remat *Remat,
- MachineFunction &MF, RematStatus &status,
+ Remat *Remat, MachineFunction &MF, RematStatus &status,
DenseMap<MachineBasicBlock *, GCNRegPressure> &MBBPressureMap,
- std::vector<HotBlock> &hotBlocks, LiveIntervals *LIS, MachineRegisterInfo &MRI,
- const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, MachineLoopInfo *MLI) {
+ std::vector<HotBlock> &hotBlocks, LiveIntervals *LIS,
+ MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, MachineLoopInfo *MLI) {
DenseMap<unsigned, MachineInstr *> UniformVgprMap =
collectUniformVgprs(Remat, MF, MRI, SIRI);
@@ -2972,7 +2938,8 @@ bool rematUniformVgprToSgpr(
// Do not replace v->s across loops. Even if the value is uniform
// branch divergence can cause a uniform value in a loop to be
// non-uniform when used outside a loop.
- if (IsSafeRematCandidateUser(&userMI, SIII) && !IsCrossLoopUse(MI, &userMI, MLI))
+ if (IsSafeRematCandidateUser(&userMI, SIII) &&
+ !IsCrossLoopUse(MI, &userMI, MLI))
userMIs.emplace_back(&userMI);
}
@@ -2988,7 +2955,7 @@ bool rematUniformVgprToSgpr(
for (MachineInstr *userMI : userMIs) {
const auto &Desc = userMI->getDesc();
bool bIllegal = false;
- for (unsigned i=0;i<userMI->getNumOperands();i++) {
+ for (unsigned i = 0; i < userMI->getNumOperands(); i++) {
MachineOperand &MO = userMI->getOperand(i);
if (!MO.isReg())
continue;
@@ -3021,7 +2988,8 @@ bool rematUniformVgprToSgpr(
auto rit = userMI->getReverseIterator();
rit++;
auto endIt = userMI->getParent()->rend();
- while (rit != endIt && !rit->isDebugInstr() && !slotIndexes->hasIndex(*rit))
+ while (rit != endIt && !rit->isDebugInstr() &&
+ !slotIndexes->hasIndex(*rit))
slotIndexes->insertMachineInstrInMaps(*(rit++));
}
}
@@ -3107,9 +3075,8 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI,
DenseSet<MachineInstr *> &hotSet, int vDistance, int sDistance,
unsigned VLimit, unsigned SLimit,
const DenseSet<MachineBasicBlock *> &MemWriteMBBSet,
- LiveIntervals *LIS,
- const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII) {
+ LiveIntervals *LIS, const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
const auto &SI = LIS->getInstructionIndex(*hotMI).getBaseIndex();
const auto LISLR = llvm::getLiveRegs(SI, *LIS, MRI);
@@ -3134,7 +3101,8 @@ bool tryRemat(MachineBasicBlock &MBB, MachineInstr *hotMI,
continue;
// Igonre inst in hot range.
- if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit || RP.getMaxSGPR() > SLimit) {
+ if (RP.getVGPRNum(ST.hasGFX90AInsts()) > VLimit ||
+ RP.getMaxSGPR() > SLimit) {
Tracker.advance();
continue;
}
@@ -3249,7 +3217,7 @@ bool tryRematInHotSpot(
unsigned VLimit = status.TargetVLimit;
unsigned SLimit = status.TargetSLimit;
- auto& ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ auto &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
const GCNRPTracker::LiveRegSet inputLive = status.MBBInputLiveMap[&MBB];
const GCNRPTracker::LiveRegSet outputLive = status.MBBOutputLiveMap[&MBB];
@@ -3300,9 +3268,8 @@ bool tryRematInHotSpot(
// Use hotVMI when apply.
inBlockHotSInstMap[&MBB] = nullptr;
if (tryRemat(MBB, hotVMI, inBlockCloneSubExps, /*bVGPR*/ true, inputLive,
- outputLive, hotSet, vDistance, sDistance, VLimit, SLimit,
- status.MemWriteMBBSet,
- LIS, MRI, SIRI, SIII))
+ outputLive, hotSet, vDistance, sDistance, VLimit, SLimit,
+ status.MemWriteMBBSet, LIS, MRI, SIRI, SIII))
return true;
}
@@ -3312,8 +3279,7 @@ bool tryRematInHotSpot(
inBlockHotVInstMap[&MBB] = nullptr;
return tryRemat(MBB, hotSMI, inBlockCloneSubExps, /*bVGPR*/ false,
inputLive, outputLive, hotSet, vDistance, sDistance, VLimit,
- SLimit, status.MemWriteMBBSet,
- LIS, MRI, SIRI, SIII);
+ SLimit, status.MemWriteMBBSet, LIS, MRI, SIRI, SIII);
}
return false;
}
@@ -3444,7 +3410,8 @@ void sortSubExpCandidates(std::vector<SubExp> &subExpCandidates) {
}
}
-// Compare pressure, return ture if maxV0/maxS0 pressure is higher than maxV1/maxS1.
+// Compare pressure, return ture if maxV0/maxS0 pressure is higher than
+// maxV1/maxS1.
bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1,
unsigned maxS1, const GCNSubtarget *ST) {
unsigned VTgtOcc0 = ST->getOccupancyWithNumVGPRs(maxV0);
@@ -3467,10 +3434,11 @@ bool pressureHigher(unsigned maxV0, unsigned maxS0, unsigned maxV1,
}
// Return true if the subExp can help pressure for passThrus.
-bool canHelpPressureWhenSink(SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus,
- const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
- MachineDominatorTree *pDT, bool bCanClone,bool bSgprBound) {
+bool canHelpPressureWhenSink(
+ SubExp &subExp, const GCNRPTracker::LiveRegSet &passThrus,
+ const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
+ const SIInstrInfo *SIII, const MachineLoopInfo *MLI,
+ MachineDominatorTree *pDT, bool bCanClone, bool bSgprBound) {
LLVM_DEBUG(subExp.dump(MRI, SIRI));
if (!subExp.isSafeToMove(MRI, /*bMoveUp*/ false))
return false;
@@ -3586,8 +3554,7 @@ bool canHelpPressureWhenHoist(SubExp &subExp, const MachineRegisterInfo &MRI,
}
SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
-groupPassThruByDefBlock(Remat *Remat,
- const GCNRPTracker::LiveRegSet &passThrus,
+groupPassThruByDefBlock(Remat *Remat, const GCNRPTracker::LiveRegSet &passThrus,
GCNRPTracker::LiveRegSet &usedPassThrus,
MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI,
const SIInstrInfo *SIII) {
@@ -3613,16 +3580,17 @@ groupPassThruByDefBlock(Remat *Remat,
GCNRPTracker::LiveRegSet &DefInMBB = Candidates[DefMI->getParent()];
DefInMBB[Reg] = it.second;
}
-
- llvm::SmallVector<std::pair<MachineBasicBlock*, GCNRPTracker::LiveRegSet>> result = Candidates.takeVector();
- LLVM_DEBUG(llvm::dbgs() << "Before sort candidates\n"; for (auto it
- : result) {
- MachineBasicBlock *MBB = it.first;
- auto &defInMBB = it.second;
- MBB->dump();
- llvm::dumpLiveSet(defInMBB, SIRI);
- } llvm::dbgs() << "end of candidates\n";);
+ llvm::SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
+ result = Candidates.takeVector();
+
+ LLVM_DEBUG(
+ llvm::dbgs() << "Before sort candidates\n"; for (auto it : result) {
+ MachineBasicBlock *MBB = it.first;
+ auto &defInMBB = it.second;
+ MBB->dump();
+ llvm::dumpLiveSet(defInMBB, SIRI);
+ } llvm::dbgs() << "end of candidates\n";);
std::sort(result.begin(), result.end(),
[](std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet> &it0,
@@ -3630,8 +3598,7 @@ groupPassThruByDefBlock(Remat *Remat,
return it0.first->getNumber() < it1.first->getNumber();
});
- LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it
- : result) {
+ LLVM_DEBUG(llvm::dbgs() << "After sort candidates\n"; for (auto it : result) {
MachineBasicBlock *MBB = it.first;
auto &defInMBB = it.second;
MBB->dump();
@@ -3688,7 +3655,8 @@ collectPassThrus(MachineBasicBlock *MBB,
return passThrus;
}
// Try to build a free subExp which all input is passThrus.
-SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp, GCNRPTracker::LiveRegSet &passThrus,
+SubExp buildFreeSubExp(Remat *Remat, SubExp &subExp,
+ GCNRPTracker::LiveRegSet &passThrus,
MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) {
SubExp freeExp;
// Try to split the subExp to find a help case.
@@ -3813,9 +3781,9 @@ std::vector<SubExp> buildSubExpCandidates(
// Try to remove out reg def sub exp from DefMBB.
GCNRPTracker::LiveRegSet &DefInMBB = it.second;
// Go up on the dag until reach share node.
- auto subExps =
- buildSubExpFromCandidates(Remat, DefInMBB, DefMBB, SIRI, SIII, MRI,
- slotIndexes, unUsedPassThrus, bAllowPartialUseInSubExp);
+ auto subExps = buildSubExpFromCandidates(
+ Remat, DefInMBB, DefMBB, SIRI, SIII, MRI, slotIndexes, unUsedPassThrus,
+ bAllowPartialUseInSubExp);
for (SubExp &subExp : subExps) {
if (subExp.bHasMemInst) {
// Skip when memory ld/st inst need to cross MBB which write memory.
@@ -3842,11 +3810,13 @@ std::vector<SubExp> buildSubExpCandidates(
}
}
if (!canHelpPressureWhenSink(subExp, passThrus, MRI, SIRI, SIII, MLI, pDT,
- bCanClone, bSgprBound)) {
- if (bAllowPartialUseInSubExp && subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) {
- SubExp freeSubExp = buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI);
- if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII, MLI, pDT,
- bCanClone, bSgprBound)) {
+ bCanClone, bSgprBound)) {
+ if (bAllowPartialUseInSubExp &&
+ subExp.isSafeToMove(MRI, /*bMoveUp*/ false)) {
+ SubExp freeSubExp =
+ buildFreeSubExp(Remat, subExp, passThrus, MRI, SIRI);
+ if (canHelpPressureWhenSink(freeSubExp, passThrus, MRI, SIRI, SIII,
+ MLI, pDT, bCanClone, bSgprBound)) {
subExpCandidates.emplace_back(freeSubExp);
}
}
@@ -3931,8 +3901,8 @@ calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
LLVM_DEBUG(std::string movStr =
Exp.bHoist ? "output hoist:" : "output sink:";
- dbgs() << movStr << Register::virtReg2Index(Reg)
- << " " << Size);
+ dbgs()
+ << movStr << Register::virtReg2Index(Reg) << " " << Size);
// Exp out live at block input.
// It will descrease live for MBB when sink and increase when hoist.
if (SIRI->isVGPR(MRI, Reg)) {
@@ -3969,10 +3939,9 @@ calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
// It will increase live for MBB.
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
- LLVM_DEBUG(std::string movStr =
- Exp.bHoist ? "input hoist:" : "input sink:";
- dbgs() << movStr << Register::virtReg2Index(Reg)
- << " " << Size);
+ LLVM_DEBUG(
+ std::string movStr = Exp.bHoist ? "input hoist:" : "input sink:";
+ dbgs() << movStr << Register::virtReg2Index(Reg) << " " << Size);
if (SIRI->isVGPR(MRI, Reg)) {
LLVM_DEBUG(dbgs() << "v\n");
if (Exp.bHoist)
@@ -4014,8 +3983,8 @@ calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
LaneBitmask profitMask = outMask & MBBBeginMask;
if (MBBBeginMask.any()) {
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
- LLVM_DEBUG(dbgs() << "move:" << Register::virtReg2Index(Reg)
- << " " << Size);
+ LLVM_DEBUG(dbgs()
+ << "move:" << Register::virtReg2Index(Reg) << " " << Size);
// Exp out live at block input.
// It will descrease live for MBB.
if (SIRI->isVGPR(MRI, Reg)) {
@@ -4043,8 +4012,8 @@ calculateSaving(HotBlock &hotBB, std::vector<SubExp> &subExpCandidates,
// It will increase live for MBB.
unsigned Size = getRegSize(Reg, profitMask, MRI, SIRI);
- LLVM_DEBUG(dbgs() << "add:" << Register::virtReg2Index(Reg)
- << " " << Size);
+ LLVM_DEBUG(dbgs()
+ << "add:" << Register::virtReg2Index(Reg) << " " << Size);
if (SIRI->isVGPR(MRI, Reg)) {
LLVM_DEBUG(dbgs() << "v\n");
vgprDiff += Size;
@@ -4090,8 +4059,8 @@ void addExpCandidates(std::vector<SubExp> &subExpCandidates,
}
bool tryToAddSubExps(
- Remat *Remat,
- HotBlock &hotBB, RematStatus &status, std::vector<SubExp> &subExpCandidates,
+ Remat *Remat, HotBlock &hotBB, RematStatus &status,
+ std::vector<SubExp> &subExpCandidates,
std::vector<SubExp> &inBlockCloneSubExps,
DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotVInstMap,
DenseMap<MachineBasicBlock *, MachineInstr *> &inBlockHotSInstMap,
@@ -4105,9 +4074,9 @@ bool tryToAddSubExps(
SlotIndexes *slotIndexes, LiveIntervals *LIS, MachineDominatorTree *pDT,
bool bCanClone, bool bVOutBound, bool bSOutBound,
GCNRPTracker::LiveRegSet &unUsedPassThrus, bool bAllowPartialUseInSubExp) {
- std::vector<SubExp> partialSubExps = buildSubExpCandidates(Remat,
- Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT, bCanClone,
- bSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
+ std::vector<SubExp> partialSubExps = buildSubExpCandidates(
+ Remat, Candidates, passThrus, MRI, SIRI, SIII, MLI, slotIndexes, pDT,
+ bCanClone, bSOutBound, unUsedPassThrus, status.MemWriteMBBSet,
bAllowPartialUseInSubExp);
GCNRPTracker::LiveRegSet tmpSavingInputLive = savingInputLive;
@@ -4177,8 +4146,8 @@ bool tryToAddSubExps(
// Try to remove out reg def sub exp from DefMBB.
GCNRPTracker::LiveRegSet &UseInMBB = it.second;
// Go up on the dag until reach share node.
- auto subExps = buildSubExpFromCandidatesTopBottom(Remat, UseInMBB, UseMBB, SIRI,
- SIII, MRI, slotIndexes);
+ auto subExps = buildSubExpFromCandidatesTopBottom(
+ Remat, UseInMBB, UseMBB, SIRI, SIII, MRI, slotIndexes);
for (SubExp &subExp : subExps) {
if (!canHelpPressureWhenHoist(subExp, MRI, SIRI, SIII, MLI, bSOutBound))
continue;
@@ -4211,8 +4180,7 @@ bool tryToAddSubExps(
if (EnableVmemDegree &&
// Only expect vmem when last tryToAddSubExps.
// If not, bAllowPartialUseInSubExp will no chance to be true.
- (bAllowPartialUseInSubExp ||
- !EnableSubExpAggressive)) {
+ (bAllowPartialUseInSubExp || !EnableSubExpAggressive)) {
// Assume vmemLdSize could be optimized by not parallel.
if (((vgpr - hotBB.vmemLdInputSize) <= VLimit ||
(vgpr - hotBB.vmemLdOutputSize) <= VLimit) &&
@@ -4251,8 +4219,7 @@ bool tryToAddSubExps(
// Reason to do it per block is to make sure passthru reuse is precise.
// If try remat on all hot blocks together, the passthru might be on one block,
// but the reuse in on another block which the reg is not passthru there.
-bool perBlockPassthruRemat(Remat *Remat,
- std::vector<HotBlock> &hotBlocks,
+bool perBlockPassthruRemat(Remat *Remat, std::vector<HotBlock> &hotBlocks,
RematStatus &status,
GCNRPTracker::LiveRegSet &liveRegCandidates,
const GCNSubtarget *ST, LiveIntervals *LIS,
@@ -4261,8 +4228,7 @@ bool perBlockPassthruRemat(Remat *Remat,
const SIRegisterInfo *SIRI,
const SIInstrInfo *SIII) {
bool bUpdated = false;
- bool bCanClone = EnableSubExpClone |
- EnableSubExpAggressive;
+ bool bCanClone = EnableSubExpClone | EnableSubExpAggressive;
SlotIndexes *slotIndexes = LIS->getSlotIndexes();
// Sort hot blocks by pressure first.
@@ -4326,19 +4292,19 @@ bool perBlockPassthruRemat(Remat *Remat,
// Group pass thru regs by def MBB.
SmallVector<std::pair<MachineBasicBlock *, GCNRPTracker::LiveRegSet>>
- Candidates =
- groupPassThruByDefBlock(Remat, passThrus, usedPassThrus, MRI, SIRI, SIII);
+ Candidates = groupPassThruByDefBlock(Remat, passThrus, usedPassThrus,
+ MRI, SIRI, SIII);
// unUsedPassThrus used to collect passThru which is skipped when build
// subExp.
GCNRPTracker::LiveRegSet unusedPassThrus;
// Build exp dag on define blocks.
bool bAllowPartialUseInSubExp = false;
- if (tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
- inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
- vgpr, sgpr, savingInputLive, savingOutputLive,
- passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
- LIS, pDT, bCanClone, bVOutBound, bSOutBound,
- unusedPassThrus, bAllowPartialUseInSubExp)) {
+ if (tryToAddSubExps(
+ Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+ inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr,
+ savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI,
+ SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+ unusedPassThrus, bAllowPartialUseInSubExp)) {
// Remove unusedPassThrus from passThrus first.
llvm::andNotLiveRegSet(passThrus, unusedPassThrus);
llvm::mergeLiveRegSet(usedPassThrus, passThrus);
@@ -4354,12 +4320,12 @@ bool perBlockPassthruRemat(Remat *Remat,
return false;
bAllowPartialUseInSubExp = true;
- if (!tryToAddSubExps(Remat, it, status, subExpCandidates, inBlockCloneSubExps,
- inBlockHotVInstMap, inBlockHotSInstMap, Candidates,
- vgpr, sgpr, savingInputLive, savingOutputLive,
- passThrus, usedRegs, MRI, SIRI, SIII, MLI, slotIndexes,
- LIS, pDT, bCanClone, bVOutBound, bSOutBound,
- unusedPassThrus, bAllowPartialUseInSubExp)) {
+ if (!tryToAddSubExps(
+ Remat, it, status, subExpCandidates, inBlockCloneSubExps,
+ inBlockHotVInstMap, inBlockHotSInstMap, Candidates, vgpr, sgpr,
+ savingInputLive, savingOutputLive, passThrus, usedRegs, MRI, SIRI,
+ SIII, MLI, slotIndexes, LIS, pDT, bCanClone, bVOutBound, bSOutBound,
+ unusedPassThrus, bAllowPartialUseInSubExp)) {
return false;
}
// Just merge all passThrus after tryToAddSubExps allow partialUseInSubExp.
@@ -4425,10 +4391,9 @@ int getVMemLdSize(MachineBasicBlock &MBB, const SIInstrInfo *SIII,
} // namespace
-bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveIntervals *LIS,
- MachineDominatorTree *pDT, MachinePostDominatorTree *pPDT,
- AliasAnalysis *AA)
-{
+bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI,
+ LiveIntervals *LIS, MachineDominatorTree *pDT,
+ MachinePostDominatorTree *pPDT, AliasAnalysis *AA) {
if (MF.size() < 2)
return false;
const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
@@ -4490,7 +4455,6 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
maxLocalSPressure, status);
maxLocalSPressure += RegForVCC;
-
}
if (maxLocalVPressure <= VLimit && maxLocalSPressure <= SLimit)
continue;
@@ -4499,7 +4463,9 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
if (bBothOutLimit && maxLocalVPressure <= VLimit)
continue;
GCNRPTracker::LiveRegSet liveSet;
- hotBlocks.push_back({ &MBB, liveSet,std::make_pair(maxLocalVPressure, maxLocalSPressure), 0, 0 });
+ hotBlocks.push_back({&MBB, liveSet,
+ std::make_pair(maxLocalVPressure, maxLocalSPressure),
+ 0, 0});
}
// Collect vmemLdInput/OutputSize.
if (EnableVmemDegree) {
@@ -4541,8 +4507,8 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
}
if (EnableUniformVectorToScalar) {
- if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap, hotBlocks, LIS, MRI,
- SIRI, SIII, MLI)) {
+ if (rematUniformVgprToSgpr(Remat, MF, status, status.MBBPressureMap,
+ hotBlocks, LIS, MRI, SIRI, SIII, MLI)) {
// Rebuild LIS.
LIS->reanalyze(MF);
status = GetRematStatus(MF, MLI, LIS, MRI, ST);
@@ -4596,15 +4562,17 @@ bool GroupRemat(Remat *Remat, MachineFunction &MF, MachineLoopInfo *MLI, LiveInt
PressureUnderLimitSet.insert(MBB);
} else {
if (MaxLocalVGPR < it.maxPressures.first)
- it.maxPressures = std::make_pair(MaxLocalVGPR, it.maxPressures.second);
+ it.maxPressures =
+ std::make_pair(MaxLocalVGPR, it.maxPressures.second);
if (MaxLocalSGPR < it.maxPressures.second)
it.maxPressures = std::make_pair(it.maxPressures.first, MaxLocalSGPR);
}
}
}
- bool bUpdated = perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates,
- ST, LIS, MLI, pDT, MRI, SIRI, SIII);
+ bool bUpdated =
+ perBlockPassthruRemat(Remat, hotBlocks, status, liveRegCandidates, ST,
+ LIS, MLI, pDT, MRI, SIRI, SIII);
return bUpdated;
}
@@ -4613,8 +4581,10 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
if (MF.size() < 2)
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
- MachineDominatorTree *DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
- MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+ MachineDominatorTree *DT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MachinePostDominatorTree *PDT =
+ &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
@@ -4629,8 +4599,8 @@ bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) {
}
}
- //LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)"));
- // For non-cs/ps, set target occ as 4.
+ // LLVM_DEBUG(pressure::write_pressure(MF, LIS, R"(D:\Temp\d.json)"));
+ // For non-cs/ps, set target occ as 4.
bool bNearTarget = false;
bool bFinalUpdated = false;
bool bUpdated = hotBlockRemat(this, MF, MLI, LIS, DT, PDT, bNearTarget);
@@ -4655,8 +4625,8 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
-INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, "AMDGPU rematerialize",
- false, false)
+INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE,
+ "AMDGPU rematerialize", false, false)
char AMDGPUHotBlockRematerialize::ID = 0;
char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
@@ -4664,4 +4634,3 @@ char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID;
FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() {
return new AMDGPUHotBlockRematerialize();
}
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
index 6f44fec082..5336fde4cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp
@@ -1,22 +1,21 @@
///////////////////////////////////////////////////////////////////////////////
// //
-// AMDGPUMIRUtils.cpp //
-// Copyright (C) Microsoft Corporation. All rights reserved. //
-// This file is distributed under the University of Illinois Open Source //
-// License. See LICENSE.TXT for details. //
+// AMDGPUMIRUtils.cpp // Copyright (C) Microsoft Corporation. All rights
+// reserved. // This file is distributed under the University of
+// Illinois Open Source // License. See LICENSE.TXT for details. //
// //
// Util functions for llvm MIR Passes. //
// //
///////////////////////////////////////////////////////////////////////////////
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
-//#include "dxc/DXIL/DxilMetadataHelper.h"
+// #include "dxc/DXIL/DxilMetadataHelper.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
@@ -26,9 +25,9 @@
#include "llvm/Support/Debug.h"
-#include "GCNRegPressure.h"
#include "AMDGPUMIRUtils.h"
#include "AMDGPUSubExpDag.h"
+#include "GCNRegPressure.h"
#include <unordered_set>
#define DEBUG_TYPE "xb-mir-util"
@@ -48,7 +47,7 @@ public:
phiInsts.insert(&I);
unsigned Reg = I.getOperand(0).getReg();
// Add incoming values.
- for (unsigned i=1;i<I.getNumOperands();i+=2) {
+ for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
MachineOperand &MO = I.getOperand(i);
if (!MO.isReg())
continue;
@@ -66,7 +65,8 @@ public:
} /// Adds custom features for a visualization of the ScheduleDAG.
void addCustomGraphFeatures(llvm::GraphWriter<CFGWithPhi *> &) const {}
MachineFunction &F;
- DenseMap<const MachineBasicBlock *, DenseSet<MachineInstr *>> blockToPhiInstsMap;
+ DenseMap<const MachineBasicBlock *, DenseSet<MachineInstr *>>
+ blockToPhiInstsMap;
void dump();
};
@@ -110,7 +110,8 @@ template <> struct DOTGraphTraits<CFGWithPhi *> : public DefaultDOTGraphTraits {
return R;
}
- static std::string getNodeLabel(const MachineBasicBlock *BB, const CFGWithPhi *G) {
+ static std::string getNodeLabel(const MachineBasicBlock *BB,
+ const CFGWithPhi *G) {
enum { MaxColumns = 8000 };
std::string Str;
raw_string_ostream OS(Str);
@@ -347,7 +348,7 @@ void andNotLiveRegSet(LiveSet &targetSet, const LiveSet &inputSet) {
}
MachineBasicBlock *split(MachineInstr *Inst) {
-
+
// Create the fall-through block.
MachineBasicBlock *MBB = Inst->getParent();
MachineFunction *MF = MBB->getParent();
@@ -462,9 +463,8 @@ bool reduceChannel(unsigned offset, MachineInstr &MI, const MCInstrDesc &desc,
.addImm(offset * LaneSize);
MachineInstr *OffsetAddMI = OffsetAdd.getInstr();
MachineBasicBlock::iterator InsertPoint =
- llvm::FindOrCreateInsertionPointForSccDef(
- MI.getParent(), MI, SIRI, SIII, &MRI
- );
+ llvm::FindOrCreateInsertionPointForSccDef(MI.getParent(), MI, SIRI,
+ SIII, &MRI);
MI.getParent()->insert(InsertPoint, OffsetAddMI);
SIII->legalizeOperands(*OffsetAddMI);
OffsetOp->setReg(NewOffsetReg);
@@ -631,7 +631,7 @@ bool reach_blocks(MachineBasicBlock *BB, MachineDominatorTree *DT,
return bCross;
}
-}
+} // namespace llvm
namespace llvm {
void viewCFGWithPhi(llvm::MachineFunction &F) {
@@ -1520,12 +1520,12 @@ void write_pressure(MachineFunction &MF, LiveIntervals *LIS, raw_ostream &os) {
}
} // namespace pressure
-}// namespace llvm
+} // namespace llvm
namespace {
class ContributionList {
public:
- ContributionList(MachineFunction &MF) : MF(MF){};
+ ContributionList(MachineFunction &MF) : MF(MF) {};
void build();
bool propagateContribution();
MachineFunction &MF;
@@ -1754,46 +1754,45 @@ void write_contribution_list(llvm::MachineFunction &MF, const char *Filename) {
}
} // namespace llvm
-static bool IsPhysReg(const MachineOperand &Op)
-{
- return Op.isReg() && Op.getReg().isPhysical();
+static bool IsPhysReg(const MachineOperand &Op) {
+ return Op.isReg() && Op.getReg().isPhysical();
}
// Sometimes a split bb uses physical registers defined in BB; we have to add
// them to live-in or the IR is malformed.
-void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB, const MachineRegisterInfo *MRI)
-{
- // Initialize with current set of liveins. For new blocks this will be empty.
- SmallDenseSet<unsigned, 8> DefSet;
- for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins())
- {
- DefSet.insert(P.PhysReg);
- }
+void llvm::UpdatePhysRegLiveInForBlock(MachineBasicBlock *NewBB,
+ const MachineRegisterInfo *MRI) {
+ // Initialize with current set of liveins. For new blocks this will be empty.
+ SmallDenseSet<unsigned, 8> DefSet;
+ for (const MachineBasicBlock::RegisterMaskPair &P : NewBB->liveins()) {
+ DefSet.insert(P.PhysReg);
+ }
- for (auto &MI : *NewBB)
- {
- // Add all undefined physical registers to the live in set.
- for (MachineOperand &Use : MI.operands())
- {
- // Only process physreg uses.
- if (!IsPhysReg(Use) || !Use.isUse()) continue;
+ for (auto &MI : *NewBB) {
+ // Add all undefined physical registers to the live in set.
+ for (MachineOperand &Use : MI.operands()) {
+ // Only process physreg uses.
+ if (!IsPhysReg(Use) || !Use.isUse())
+ continue;
- // Reserved regs do not need to be tracked through live-in sets.
- unsigned Reg = Use.getReg();
- if (Use.isImplicit() && MRI && MRI->isReserved(Reg)) continue;
+ // Reserved regs do not need to be tracked through live-in sets.
+ unsigned Reg = Use.getReg();
+ if (Use.isImplicit() && MRI && MRI->isReserved(Reg))
+ continue;
- if (!DefSet.count(Reg))
- NewBB->addLiveIn(Reg);
- }
+ if (!DefSet.count(Reg))
+ NewBB->addLiveIn(Reg);
+ }
- // Add all physical register defs (exlicit+implicit) to the def register set.
- for (MachineOperand &Def : MI.operands())
- {
- // Only process physreg defs.
- if (!IsPhysReg(Def) || !Def.isDef()) continue;
- DefSet.insert(Def.getReg());
- }
+      // Add all physical register defs (explicit+implicit) to the def register
+ // set.
+ for (MachineOperand &Def : MI.operands()) {
+ // Only process physreg defs.
+ if (!IsPhysReg(Def) || !Def.isDef())
+ continue;
+ DefSet.insert(Def.getReg());
}
+ }
}
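// Illustrative usage sketch (editor's example, not part of the patch): after
// splitting a block with the split() helper above, the new block's physreg
// live-ins can be recomputed from its instructions:
//   MachineBasicBlock *NewBB = split(&MI);
//   UpdatePhysRegLiveInForBlock(NewBB, &NewBB->getParent()->getRegInfo());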
void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB,
@@ -1829,50 +1828,41 @@ void llvm::BuildPhysRegLiveInForBlock(MachineBasicBlock *NewBB,
}
}
-MachineReg llvm::CreateVirtualRegForOperand(
- MachineOpcode Opcode,
- unsigned OpNum,
- MachineFunction &MF
-)
-{
- const TargetSubtargetInfo &ST = MF.getSubtarget();
- const TargetRegisterInfo *TRI = ST.getRegisterInfo();
- const TargetInstrInfo *TII = ST.getInstrInfo();
- const MCInstrDesc &Desc = TII->get(Opcode);
- const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF);
- if (!RC)
- {
- llvm::report_fatal_error("Unable to create virtual reg for instruction operand");
- }
+MachineReg llvm::CreateVirtualRegForOperand(MachineOpcode Opcode,
+ unsigned OpNum,
+ MachineFunction &MF) {
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI, MF);
+ if (!RC) {
+ llvm::report_fatal_error(
+ "Unable to create virtual reg for instruction operand");
+ }
- MachineRegisterInfo &MRI = MF.getRegInfo();
- return MRI.createVirtualRegister(RC);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ return MRI.createVirtualRegister(RC);
}
-MachineReg llvm::CreateVirtualDstReg(
- MachineOpcode Opcode,
- MachineFunction &MF
-)
-{
- return llvm::CreateVirtualRegForOperand(Opcode, 0, MF);
+MachineReg llvm::CreateVirtualDstReg(MachineOpcode Opcode,
+ MachineFunction &MF) {
+ return llvm::CreateVirtualRegForOperand(Opcode, 0, MF);
}
// Return true if the MI is a copy of exec.
// If true then sets pDst to the destination register.
-bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst)
-{
- enum {DST=0, SRC=1};
- bool FoundCopy = false;
- if (MI.getOpcode() == AMDGPU::COPY
- || MI.getOpcode() == AMDGPU::S_MOV_B32
- || MI.getOpcode() == AMDGPU::S_MOV_B64)
- {
- const MachineOperand &Src = MI.getOperand(SRC);
- if (Src.isReg() && Src.getReg() == Exec)
- {
- FoundCopy = true;
- }
+bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec,
+ MachineReg *pDst) {
+ enum { DST = 0, SRC = 1 };
+ bool FoundCopy = false;
+ if (MI.getOpcode() == AMDGPU::COPY || MI.getOpcode() == AMDGPU::S_MOV_B32 ||
+ MI.getOpcode() == AMDGPU::S_MOV_B64) {
+ const MachineOperand &Src = MI.getOperand(SRC);
+ if (Src.isReg() && Src.getReg() == Exec) {
+ FoundCopy = true;
}
+ }
#if 0 // TODO: Delete this.
else if (MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO ||
MI.getOpcode() == AMDGPU::AMDGPU_GET_ENTRY_ACTIVE_MASK_PSEUDO_32)
@@ -1880,29 +1870,26 @@ bool llvm::IsExecCopy(const MachineInstr &MI, MachineReg Exec, MachineReg *pDst)
FoundCopy = true;
}
#endif
-
- if (FoundCopy)
- {
- *pDst = MI.getOperand(DST).getReg();
- }
- return FoundCopy;
+ if (FoundCopy) {
+ *pDst = MI.getOperand(DST).getReg();
+ }
+
+ return FoundCopy;
}
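// Illustrative usage sketch (editor's example, not part of the patch):
//   MachineReg SavedExec = AMDGPU::NoRegister;
//   if (IsExecCopy(MI, AMDGPU::EXEC, &SavedExec)) {
//     // SavedExec now holds the register the exec mask was copied into.
//   }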
-llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF)
-{
- llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister, AMDGPU::NoSubRegister};
- if (MachineInstr* MI = GetWqmEntryActiveMaskInst(MF))
- {
- LiveLaneMask.Reg = MI->getOperand(0).getReg();
- LiveLaneMask.SubReg = MI->getOperand(0).getSubReg();
- }
+llvm::MachineRegWithSubReg llvm::GetWqmEntryActiveMask(MachineFunction &MF) {
+ llvm::MachineRegWithSubReg LiveLaneMask = {AMDGPU::NoRegister,
+ AMDGPU::NoSubRegister};
+ if (MachineInstr *MI = GetWqmEntryActiveMaskInst(MF)) {
+ LiveLaneMask.Reg = MI->getOperand(0).getReg();
+ LiveLaneMask.SubReg = MI->getOperand(0).getSubReg();
+ }
- return LiveLaneMask;
+ return LiveLaneMask;
}
-MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF)
-{
+MachineInstr *llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF) {
#if 0 // TODO: Get rid of this
// Look forward in the entry block for the SET_LIVE_LANE_MASK instruction.
// This instruction is added by the SIWholeQuadMode pass.
@@ -1917,22 +1904,23 @@ MachineInstr* llvm::GetWqmEntryActiveMaskInst(MachineFunction &MF)
}
#endif
- return nullptr;
+ return nullptr;
}
-bool llvm::IsFetchShaderCall(const MachineInstr *MI)
-{
+bool llvm::IsFetchShaderCall(const MachineInstr *MI) {
#if 0 // TODO: Get rid of this.
return
MI->getOpcode() == AMDGPU::AMDGPU_CALL_FETCH_SHADER ||
MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::FetchShaderCall);
#else
- return false;
+ return false;
#endif
}
-bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI) {
- const TargetRegisterInfo* TRI = MBB->getParent()->getRegInfo().getTargetRegisterInfo();
+bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock::iterator MI) {
+ const TargetRegisterInfo *TRI =
+ MBB->getParent()->getRegInfo().getTargetRegisterInfo();
for (auto it = MI; it != MBB->end(); ++it) {
const MachineInstr &CurMI = *it;
// Hit use of scc, it is live.
@@ -1962,79 +1950,70 @@ bool llvm::IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::it
// as the new insert location.
//
MachineBasicBlock::iterator llvm::FindOrCreateInsertionPointForSccDef(
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator MI,
- const TargetRegisterInfo* TRI,
- const SIInstrInfo* TII,
- MachineRegisterInfo* MRI,
- SccDefInsertPointConstraintFlags Constraints
-)
-{
- // If SCC is dead at MI when we can use MI as the insert point.
- if (!llvm::IsSccLiveAt(MBB, MI))
- {
- return MI;
- }
+ MachineBasicBlock *MBB, MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo *TRI, const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI, SccDefInsertPointConstraintFlags Constraints) {
+  // If SCC is dead at MI then we can use MI as the insert point.
+ if (!llvm::IsSccLiveAt(MBB, MI)) {
+ return MI;
+ }
- const bool CheckForExecWrite =
- Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
+ const bool CheckForExecWrite =
+ Constraints & SccDefInsertPointConstraintFlags::NoExecWrite;
- // Get the starting reverse iterator taking care to handle the MBB->end() case.
- MachineBasicBlock::reverse_iterator Start;
- if (MI == MBB->end())
- {
- Start = MBB->rbegin();
- }
- else
- {
- Start = MI.getReverse();
- }
-
- // Otherwise, walk backwards through the block looking for a location where
- // SCC is dead.
- for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); It != End; ++It)
- {
- // If the instruction modifies exec then we cannot use it as
- // an insertion point (if that is a constraint from the caller).
- // The check for EXEC works for both wave64 and wave32 because
- // it will also catch writes to the subregisters (e.g. exec_lo).
- if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI))
- {
- break;
- }
+ // Get the starting reverse iterator taking care to handle the MBB->end()
+ // case.
+ MachineBasicBlock::reverse_iterator Start;
+ if (MI == MBB->end()) {
+ Start = MBB->rbegin();
+ } else {
+ Start = MI.getReverse();
+ }
- if (It->modifiesRegister(AMDGPU::SCC, TRI)
- && !It->readsRegister(AMDGPU::SCC, TRI))
- {
- return It->getIterator();
- }
+ // Otherwise, walk backwards through the block looking for a location where
+ // SCC is dead.
+ for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend();
+ It != End; ++It) {
+ // If the instruction modifies exec then we cannot use it as
+ // an insertion point (if that is a constraint from the caller).
+ // The check for EXEC works for both wave64 and wave32 because
+ // it will also catch writes to the subregisters (e.g. exec_lo).
+ if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) {
+ break;
}
- // If no safe location can be found in the block we can save and restore
- // SCC around MI. There is no way to directly read or write SCC so we use
- // s_cselect to read the current value of SCC and s_cmp to write the saved
- // value back to SCC.
- //
- // The generated code will look like this;
- //
- // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC
- // <----- Newly created safe insert point.
- // MI
- // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC
- //
- unsigned int TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- DebugLoc DL = MI->getDebugLoc();
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
- .addImm(-1)
- .addImm(0);
- BuildMI(*MBB, std::next(MI->getIterator()), DL, TII->get(AMDGPU::S_CMP_LG_U32))
- .addReg(TmpScc, RegState::Kill)
- .addImm(0);
+ if (It->modifiesRegister(AMDGPU::SCC, TRI) &&
+ !It->readsRegister(AMDGPU::SCC, TRI)) {
+ return It->getIterator();
+ }
+ }
- return MI;
+ // If no safe location can be found in the block we can save and restore
+ // SCC around MI. There is no way to directly read or write SCC so we use
+ // s_cselect to read the current value of SCC and s_cmp to write the saved
+ // value back to SCC.
+ //
+  // The generated code will look like this:
+ //
+ // S_CSELECT_B32 %SavedSCC, -1, 0 # Save SCC
+ // <----- Newly created safe insert point.
+ // MI
+ // S_CMP_LG_U32 %SavedSCC, 0 # Restore SCC
+ //
+ unsigned int TmpScc =
+ MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), TmpScc)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(*MBB, std::next(MI->getIterator()), DL,
+ TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(TmpScc, RegState::Kill)
+ .addImm(0);
+
+ return MI;
}
-
namespace {
bool isLocalSegment(const LiveRange::Segment *Seg, SlotIndexes *Indexes,
SmallDenseSet<MachineBasicBlock *, 2> &touchedMBBSet) {
@@ -2099,9 +2078,7 @@ bool llvm::isLocalLiveInterval(
return isLocalLiveRange(&LI, Indexes, touchedMBBSet);
}
-
-bool llvm::isLocalLiveInterval(
- const LiveInterval &LI, SlotIndexes *Indexes) {
+bool llvm::isLocalLiveInterval(const LiveInterval &LI, SlotIndexes *Indexes) {
if (LI.hasSubRanges()) {
for (const auto &S : LI.subranges()) {
if (!isLocalLiveRange(&S, Indexes))
@@ -2117,8 +2094,8 @@ bool llvm::isLocalLiveInterval(
void llvm::buildEndLiveMap(
llvm::LiveIntervals *LIS, llvm::MachineFunction &MF,
const llvm::MachineRegisterInfo &MRI,
- llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet>
- &MBBLiveMap, bool After) {
+ llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet> &MBBLiveMap,
+ bool After) {
// When there is only one block, the end live reg set must be empty.
if (MF.size() == 1)
return;
@@ -2158,7 +2135,8 @@ void llvm::buildEndLiveMap(
}
}
-unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) {
+unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF,
+ const SIRegisterInfo *SIRI) {
auto &MRI = MF.getRegInfo();
for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
if (MRI.isPhysRegUsed(Reg)) {
@@ -2168,14 +2146,16 @@ unsigned llvm::GetCurrentVGPRCount(llvm::MachineFunction &MF, const SIRegisterIn
return 0;
}
-unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterInfo *SIRI) {
+unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF,
+ const SIRegisterInfo *SIRI) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned MaxSGPR = 0;
for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
if (MRI.isPhysRegUsed(Reg)) {
- // Skip scratch reserved reg, which is a big register that don't really contribute to this stat.
+      // Skip the scratch reserved reg, which is a big register that doesn't
+      // really contribute to this stat.
if (ScratchRSrcReg != 0) {
if (SIRI->isSubRegister(ScratchRSrcReg, Reg))
continue;
@@ -2187,8 +2167,7 @@ unsigned llvm::GetCurrentSGPRCount(llvm::MachineFunction &MF, const SIRegisterIn
return 1 + llvm::RegForVCC + MaxSGPR;
}
-void llvm::dumpLiveSet(const LiveSet &LiveSet,
- const SIRegisterInfo *SIRI) {
+void llvm::dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) {
dbgs() << "\n live set: \n";
for (auto it : LiveSet) {
@@ -2227,15 +2206,16 @@ bool llvm::IsLdsSpillSupportedForHwStage(xmd::HwStage Stage)
}
#endif
-MachineBasicBlock::succ_iterator llvm::FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ)
-{
- for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(), End = MBB->succ_end(); It != End; ++It)
- {
- if (*It == Succ)
- {
- return It;
- }
+MachineBasicBlock::succ_iterator
+llvm::FindSuccessor(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock *Succ) {
+ for (MachineBasicBlock::succ_iterator It = MBB->succ_begin(),
+ End = MBB->succ_end();
+ It != End; ++It) {
+ if (*It == Succ) {
+ return It;
}
+ }
- return MBB->succ_end();
+ return MBB->succ_end();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
index 16b55c5c94..b077fad4c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h
@@ -2,9 +2,9 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/IR/CallingConv.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/LaneBitmask.h"
namespace llvm {
@@ -37,10 +37,10 @@ using LiveSet = llvm::DenseMap<unsigned, llvm::LaneBitmask>;
unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI);
-void CollectLiveSetPressure(
- const LiveSet &liveSet,
- const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI,
- unsigned &VPressure, unsigned &SPressure);
+void CollectLiveSetPressure(const LiveSet &liveSet,
+ const llvm::MachineRegisterInfo &MRI,
+ const llvm::SIRegisterInfo *SIRI,
+ unsigned &VPressure, unsigned &SPressure);
bool isExecUpdateForControlFlow(llvm::MachineInstr &MI);
@@ -60,37 +60,34 @@ bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
const llvm::SIInstrInfo *TII,
llvm::SlotIndexes *SlotIndexes);
-bool reach_block(llvm::MachineBasicBlock *FromBB, llvm::MachineDominatorTree *DT,
+bool reach_block(llvm::MachineBasicBlock *FromBB,
+ llvm::MachineDominatorTree *DT,
llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
llvm::MachineBasicBlock *ToBB);
-
void viewCFGWithPhi(llvm::MachineFunction &MF);
void write_contribution_list(llvm::MachineFunction &MF, const char *Filename);
-llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF, const llvm::SIInstrInfo *TII);
+llvm::MachineBasicBlock *CreateNullExportBlock(llvm::MachineFunction &MF,
+ const llvm::SIInstrInfo *TII);
bool GetNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd,
llvm::MachineBasicBlock &MBB);
-void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB, const llvm::MachineRegisterInfo *MRI);
+void UpdatePhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB,
+ const llvm::MachineRegisterInfo *MRI);
void BuildPhysRegLiveInForBlock(llvm::MachineBasicBlock *NewBB,
- llvm::SmallDenseSet<unsigned, 8> &LiveOutSet,
- const llvm::MachineRegisterInfo *MRI);
+ llvm::SmallDenseSet<unsigned, 8> &LiveOutSet,
+ const llvm::MachineRegisterInfo *MRI);
-MachineReg CreateVirtualRegForOperand(
- MachineOpcode Opcode,
- unsigned Operand,
- llvm::MachineFunction &MF
-);
+MachineReg CreateVirtualRegForOperand(MachineOpcode Opcode, unsigned Operand,
+ llvm::MachineFunction &MF);
-MachineReg CreateVirtualDstReg(
- MachineOpcode Opcode,
- llvm::MachineFunction &MF
-);
+MachineReg CreateVirtualDstReg(MachineOpcode Opcode, llvm::MachineFunction &MF);
-bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec, MachineReg *pDst);
+bool IsExecCopy(const llvm::MachineInstr &MI, MachineReg Exec,
+ MachineReg *pDst);
struct MachineRegWithSubReg {
MachineReg Reg = AMDGPU::NoRegister;
unsigned SubReg = AMDGPU::NoSubRegister;
@@ -98,22 +95,22 @@ struct MachineRegWithSubReg {
MachineRegWithSubReg GetWqmEntryActiveMask(llvm::MachineFunction &MF);
llvm::MachineInstr *GetWqmEntryActiveMaskInst(llvm::MachineFunction &MF);
-// Return true if this machine instruction represents a call to the fetch shader.
-// We curently have two mechanisims for calling fetch shader:
+// Return true if this machine instruction represents a call to the fetch
+// shader. We currently have two mechanisms for calling the fetch shader:
// 1. The AMDGPU_CALL_FETCH_SHADER pseudo-instruction
// 2. A CALL instruction with the `FetchShaderCall` flag set to true.
-bool IsFetchShaderCall(const llvm::MachineInstr* MI);
-
-bool IsSccLiveAt(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator MI);
+bool IsFetchShaderCall(const llvm::MachineInstr *MI);
+bool IsSccLiveAt(llvm::MachineBasicBlock *MBB,
+ llvm::MachineBasicBlock::iterator MI);
// An enum used to pass additional constraints to
// `FindOrCreateInsertionPointForSccDef()`. This will further
// constrain the location where the scc def can be inserted.
-enum SccDefInsertPointConstraintFlags
-{
- None = 0, // No additional constraints.
- NoExecWrite = 1, // Should be no modification of exec between BeforeInst and insert point.
+enum SccDefInsertPointConstraintFlags {
+ None = 0, // No additional constraints.
+ NoExecWrite = 1, // Should be no modification of exec between BeforeInst and
+ // insert point.
};
// Look for a safe place to insert an instruction that defines scc.
@@ -130,55 +127,53 @@ enum SccDefInsertPointConstraintFlags
// as the new insert location.
//
llvm::MachineBasicBlock::iterator FindOrCreateInsertionPointForSccDef(
- llvm::MachineBasicBlock* MBB,
- llvm::MachineBasicBlock::iterator BeforeInst,
- const llvm::TargetRegisterInfo* TRI,
- const llvm::SIInstrInfo* TII,
- llvm::MachineRegisterInfo* MRI,
- SccDefInsertPointConstraintFlags Constraints = SccDefInsertPointConstraintFlags::None
-);
+ llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst,
+ const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII,
+ llvm::MachineRegisterInfo *MRI,
+ SccDefInsertPointConstraintFlags Constraints =
+ SccDefInsertPointConstraintFlags::None);
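// Illustrative usage sketch (editor's example, not part of the patch):
// inserting an SCC-defining instruction at a point where SCC is dead, while
// also forbidding exec writes between BeforeInst and the insert point.
//   MachineBasicBlock::iterator InsertPt = FindOrCreateInsertionPointForSccDef(
//       MBB, BeforeInst, TRI, TII, &MRI,
//       SccDefInsertPointConstraintFlags::NoExecWrite);
//   BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_CMP_LG_U32))
//       .addReg(Reg) // hypothetical register operand
//       .addImm(0);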
// Check if LI lives across basic blocks; save all touched basic blocks if it
// is local.
bool isLocalLiveInterval(
const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes,
llvm::SmallDenseSet<llvm::MachineBasicBlock *, 2> &touchedMBBSet);
-bool isLocalLiveInterval(
- const llvm::LiveInterval &LI, llvm::SlotIndexes *Indexes);
+bool isLocalLiveInterval(const llvm::LiveInterval &LI,
+ llvm::SlotIndexes *Indexes);
// build liveRegSet at end of each MBB.
void buildEndLiveMap(
llvm::LiveIntervals *LIS, llvm::MachineFunction &MF,
const llvm::MachineRegisterInfo &MRI,
- llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet>
- &MBBLiveMap, bool After);
+ llvm::DenseMap<llvm::MachineBasicBlock *, LiveSet> &MBBLiveMap, bool After);
-void dumpLiveSet(const LiveSet &LiveSet,
- const llvm::SIRegisterInfo *SIRI);
+void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI);
-unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI);
-unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF, const llvm::SIRegisterInfo *SIRI);
+unsigned GetCurrentVGPRCount(llvm::MachineFunction &MF,
+ const llvm::SIRegisterInfo *SIRI);
+unsigned GetCurrentSGPRCount(llvm::MachineFunction &MF,
+ const llvm::SIRegisterInfo *SIRI);
bool isFastMathInst(llvm::MachineInstr &MI);
namespace pressure {
void print_reg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI,
- const llvm::SIRegisterInfo *SIRI,
- llvm::raw_ostream &os);
+ const llvm::SIRegisterInfo *SIRI, llvm::raw_ostream &os);
void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS,
const char *Filename);
void write_pressure(llvm::MachineFunction &MF, llvm::LiveIntervals *LIS,
llvm::raw_ostream &os);
-}
+} // namespace pressure
// bool IsLdsSpillSupportedForHwStage(xmd::HwStage Stage);
// Look for the successor `Succ` of the given `MBB`.
// Returns MBB->succ_end() if `Succ` is not a successor of MBB.
-llvm::MachineBasicBlock::succ_iterator FindSuccessor(llvm::MachineBasicBlock* MBB, llvm::MachineBasicBlock* Succ);
+llvm::MachineBasicBlock::succ_iterator
+FindSuccessor(llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock *Succ);
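// Illustrative usage sketch (editor's example, not part of the patch):
//   MachineBasicBlock::succ_iterator It = FindSuccessor(MBB, Succ);
//   if (It != MBB->succ_end())
//     MBB->removeSuccessor(It); // e.g. detach Succ when rewriting the CFG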
// The enum and helper function for v_perm selection mask.
//
-// The input byte layout of v_perm is as below:
+// The input byte layout of v_perm is as below:
//
// BYTE in[8]
// in[0] = $src1_BYTE0;
@@ -211,7 +206,7 @@ constexpr int buildVPermSelectMask(V_PERM_IN_BYTE_POS Sel_0,
V_PERM_IN_BYTE_POS Sel_1,
V_PERM_IN_BYTE_POS Sel_2,
V_PERM_IN_BYTE_POS Sel_3) {
- return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) |
- ((int)Sel_1 << 8) | (int)Sel_0);
-}
+ return (((int)Sel_3 << 24) | ((int)Sel_2 << 16) | ((int)Sel_1 << 8) |
+ (int)Sel_0);
}
+} // namespace llvm
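// Illustrative example (editor's sketch, not part of the patch): assuming the
// V_PERM_IN_BYTE_POS enumerators number the input bytes 0..7 per the layout
// above (in[0..3] = $src1, in[4..7] = $src0), a selector that swaps the two
// 16-bit halves of $src1 would be
//   constexpr int Mask = buildVPermSelectMask(V_PERM_IN_BYTE_POS(2),  // out[0]
//                                             V_PERM_IN_BYTE_POS(3),  // out[1]
//                                             V_PERM_IN_BYTE_POS(0),  // out[2]
//                                             V_PERM_IN_BYTE_POS(1)); // out[3]
// i.e. Mask == 0x01000302, with Sel_0 packed into the lowest byte.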
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
index ceb22b5ff9..21aa5db0c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.cpp
@@ -69,7 +69,8 @@
// ...
//
// label:
-// v3 = phi v0, v1 ; divergent! because of divergent branch.
+// v3 = phi v0, v1 ; divergent! because of divergent
+// branch.
//
// The boolean value is bit-divergent. When passed to the branch as an operand,
// the branch becomes divergent, whose sync dependency will be computed as
@@ -81,13 +82,14 @@
// control flow.
// For case like
// %163:sreg_64_xexec = S_MOV_B64 $exec
-//bb.1:
+// bb.1:
//; predecessors: %bb.1, %bb.0
-// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
-// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
+// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%),
+// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec
// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec
-// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec,
+// implicit-def $scc, implicit $exec
//...
// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc
// S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -164,20 +166,20 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "AMDGPUMirDivergenceAnalysis.h"
-#include "GCNSubtarget.h"
+#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "TargetInfo/AMDGPUTargetInfo.h"
-#include "SIInstrInfo.h"
-//#include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
+// #include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/Debug.h"
-//#include "newbe/cli/newbe_opts.h" // AMDGPU change.
+// #include "newbe/cli/newbe_opts.h" // AMDGPU change.
#include "llvm/Support/raw_ostream.h"
#include <vector>
@@ -1223,24 +1225,24 @@ bool llvm::isAMDGPUOpcodeDivergent(class MachineInstr *MI) {
case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_nsa_gfx10:
case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_si:
case AMDGPU::IMAGE_ATOMIC_AND_V2_V4_vi:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_si:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_si:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V1_vi:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_nsa_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_si:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V2_vi:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_nsa_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_si:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V2_vi:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si:
- //case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_nsa_gfx10:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_si:
+ // case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V1_V4_vi:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_nsa_gfx10:
case AMDGPU::IMAGE_ATOMIC_CMPSWAP_V2_V4_si:
@@ -1555,8 +1557,8 @@ bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
if (MO.isUse())
continue;
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
- Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO)
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::VCC ||
+ Reg == AMDGPU::VCC_LO)
return true;
// Check if the written register class overlaps the bool register class.
@@ -1567,15 +1569,15 @@ bool writeBoolDst(const MachineInstr *MI, const SIRegisterInfo *SIRI,
//
// The underlying problem is that we have two notions of divergence
// (bit divergence and wave divergence) but the algorithm only propagates
- // wave divergence. The bit divergence is important for bools because it determines
- // if a branch is uniform or not (and thus catches cases where a uniform value is
- // used outside of a divergent control flow region). For bool values the
- // algorithm will treat normally uniform values (i.e. scalar registers) as divergent
- // in order to try and propagate bit divergence.
+ // wave divergence. The bit divergence is important for bools because it
+ // determines if a branch is uniform or not (and thus catches cases where a
+ // uniform value is used outside of a divergent control flow region). For
+ // bool values the algorithm will treat normally uniform values (i.e. scalar
+ // registers) as divergent in order to try and propagate bit divergence.
//
- // To fix all the possible bugs here I think we need to actually proagate bit
- // divergence as well as wave divergences. That is a bigger fix and this check should
- // cover most cases of treating a bool value as divergent.
+  // To fix all the possible bugs here I think we need to actually propagate
+  // bit divergence as well as wave divergence. That is a bigger fix and this
+ // check should cover most cases of treating a bool value as divergent.
const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg);
if (SIRI->getCommonSubClass(BoolRC, RC))
return true;
@@ -1597,13 +1599,13 @@ bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
!MI->isTerminator())
return true;
break;
- //case AMDGPU::AMDGPU_MAKE_UNIFORM:
- //case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
+ // case AMDGPU::AMDGPU_MAKE_UNIFORM:
+ // case AMDGPU::AMDGPU_WAVE_READ_LANE_FIRST:
case AMDGPU::V_READFIRSTLANE_B32:
case AMDGPU::V_READLANE_B32:
- //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
- //case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
- // bool readfirstlane should be 1 bit, which means bit uniform.
+ // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W32:
+ // case AMDGPU::AMDGPU_WAVE_ACTIVE_BALLOT_W64:
+ // bool readfirstlane should be 1 bit, which means bit uniform.
return true;
case AMDGPU::S_OR_B32:
case AMDGPU::S_OR_B64: {
@@ -1638,7 +1640,8 @@ bool isAlwaysUniformMI(const MachineInstr *MI, const SIInstrInfo *SIII,
}
bool isPhysicalReg(MachineRegisterInfo &MRI, Register reg) {
- return reg.isPhysical();;
+ return reg.isPhysical();
+ ;
}
bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) {
@@ -1646,13 +1649,14 @@ bool isRegClass(MachineRegisterInfo &MRI, unsigned reg, unsigned regClassID) {
}
// For an input reg of the MF, a VGPR will be divergent.
-bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) {
+bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) {
if (isPhysicalReg(MRI, Reg)) {
unsigned vir_reg = MRI.getLiveInVirtReg(Reg);
if (SIRI->isVGPR(MRI, vir_reg))
return true;
} else {
- if (SIRI->isVGPR(MRI, Reg))
+ if (SIRI->isVGPR(MRI, Reg))
return true;
}
return false;
@@ -1660,8 +1664,8 @@ bool isDivergentInputReg(unsigned Reg, MachineRegisterInfo &MRI, const SIRegiste
bool isSourceOfDivergence(MachineInstr *MI, MachineRegisterInfo &MRI,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
- //if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent))
- // return true;
+ // if (MI->getAMDGPUFlag(MachineInstr::AMDGPUMIFlag::IsDivergent))
+ // return true;
if (isAMDGPUOpcodeDivergent(MI))
return true;
@@ -1715,8 +1719,7 @@ bool isWriteExec(const MachineInstr *MI) {
if (MO.isUse())
continue;
unsigned Reg = MO.getReg();
- if (Reg == AMDGPU::EXEC ||
- Reg == AMDGPU::EXEC_LO)
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
return true;
}
return false;
@@ -1735,7 +1738,6 @@ bool isVCndMask(unsigned Opcode) {
}
}
-
bool isExecRegionOp(unsigned Op) {
switch (Op) {
default:
@@ -1812,17 +1814,18 @@ bool isInsideExecRegion(const MachineBasicBlock &MBB,
return PDT.dominates(RegionEndMBB, &MBB);
}
-// Map from BB to nearest Exec Region. How to build? Add every MBB unless already has smaller region?
-// Then when hit saveExec, propagate leaked users of define inside the exec region.
+// Map from BB to nearest Exec Region. How to build? Add every MBB unless
+// already has smaller region? Then when hit saveExec, propagate leaked users of
+// define inside the exec region.
} // namespace
namespace llvm {
// class DivergenceAnalysis
DivergenceAnalysis::DivergenceAnalysis(
- const MachineFunction &F, const MachineLoop *RegionLoop, const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI,
- SyncDependenceAnalysis &SDA, bool IsLCSSAForm,
+ const MachineFunction &F, const MachineLoop *RegionLoop,
+ const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm,
// AMDGPU change begin.
DivergentJoinMapTy &JoinMap
// AMDGPU change end.
@@ -1841,7 +1844,7 @@ void DivergenceAnalysis::markDivergent(const ValueTy DivVal) {
LLVM_DEBUG(const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *SIRI = ST->getRegisterInfo();
dbgs() << "\t MarkDivergent :"; printReg(DivVal, SIRI););
- //AMDGPU change end.
+ // AMDGPU change end.
DivergentValues.insert(DivVal);
}
@@ -1948,7 +1951,7 @@ bool DivergenceAnalysis::updateTerminator(const MachineInstr &Term) const {
// Check bit uniform here if not divergent.
return !isBitUniform(Term, Processed);
}
- //case AMDGPU::AMDGPU_CALL_INDIRECT:
+ // case AMDGPU::AMDGPU_CALL_INDIRECT:
case AMDGPU::SI_CALL:
return true;
}
@@ -1965,13 +1968,10 @@ bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const {
continue;
Register Reg = Op.getReg();
if (Reg.isPhysical()) {
- if (Reg == AMDGPU::EXEC ||
- Reg == AMDGPU::EXEC_LO ||
- Reg == AMDGPU::SCC)
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::SCC)
continue;
- else
- if (const MachineInstr *DefMI =
- findPhysicalDefineInSameMBB(Op.getParent(), Reg)) {
+ else if (const MachineInstr *DefMI =
+ findPhysicalDefineInSameMBB(Op.getParent(), Reg)) {
if (isDivergent(*DefMI))
return true;
} else {
@@ -1986,15 +1986,17 @@ bool DivergenceAnalysis::updateNormalInstruction(const MachineInstr &I) const {
return false;
}
-bool DivergenceAnalysis::isTemporalDivergent(const MachineBasicBlock &ObservingBlock,
- const ValueTy Val,
- const MachineBasicBlock &IncomingBlock) const { // AMDGPU change
- const MachineBasicBlock *DefBlock = &IncomingBlock; // AMDGPU change: Take def point as incoming block for constants.
+bool DivergenceAnalysis::isTemporalDivergent(
+ const MachineBasicBlock &ObservingBlock, const ValueTy Val,
+ const MachineBasicBlock &IncomingBlock) const { // AMDGPU change
+ const MachineBasicBlock *DefBlock =
+ &IncomingBlock; // AMDGPU change: Take def point as incoming block for
+ // constants.
const auto *Inst = MRI.getUniqueVRegDef(Val);
if (Inst == nullptr)
return true;
if (Inst)
- DefBlock = Inst->getParent();
+ DefBlock = Inst->getParent();
// check whether any divergent loop carrying Val terminates before control
// proceeds to ObservingBlock
@@ -2020,13 +2022,14 @@ static bool HasIncomingUndefValue(const PHINode_ *Phi) {
// For case like
// %163:sreg_64_xexec = S_MOV_B64 $exec
-//bb.1:
+// bb.1:
//; predecessors: %bb.1, %bb.0
-// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
-// %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
+// successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%),
+// %bb.2(50.00%) %162:vreg_512 = PHI %41:vreg_512, %bb.0, %40:vreg_512, %bb.1
// %167:sgpr_32 = V_READFIRSTLANE_B32 %17:vgpr_32, implicit $exec
// %168:sreg_64 = V_CMP_EQ_U32_e64 %167:sgpr_32, %17:vgpr_32, implicit $exec
-// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
+// %166:sreg_64 = S_AND_SAVEEXEC_B64 %168:sreg_64, implicit-def $exec,
+// implicit-def $scc, implicit $exec
//...
// $exec = S_XOR_B64_term $exec, %166:sreg_64, implicit-def $scc
// S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -2091,8 +2094,8 @@ findSaveExec(const MachineInstr *MI,
// It will only run on divergent branch, so (A, B) is not in
// DivergentDisjointMap when A is uniform.
static bool isJoinDivergentOnlyOnSameIncomingValue(
- const PHINode_ &Phi, const DivergenceAnalysis *pDA, const MachineDominatorTree &DT,
- DivergentJoinMapTy &DivergentJoinMap) {
+ const PHINode_ &Phi, const DivergenceAnalysis *pDA,
+ const MachineDominatorTree &DT, DivergentJoinMapTy &DivergentJoinMap) {
// for phi which join divergent, if the incoming values from divergent
// branch are the same, the phi is still uniform.
// A
@@ -2183,14 +2186,14 @@ bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const {
// joining divergent disjoint path in Phi parent block
if (isJoinDivergent(*Phi.getParent())) {
// AMDGPU CHANGE BEGIN
- if (true/*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) {
+ if (true /*TODO: ENABLE_AGGRESSIVE_UNIFORM_ANALYSIS*/) {
// Continue if the divergent join only on same incoming value.
if (!isJoinDivergentOnlyOnSameIncomingValue(Phi, this, DT,
DivergentJoinMap))
return true;
} else
- // AMDGPU CHANGE END
- return true;
+ // AMDGPU CHANGE END
+ return true;
}
// An incoming value could be divergent by itself.
@@ -2213,7 +2216,6 @@ bool DivergenceAnalysis::updatePHINode(const PHINode_ &Phi) const {
if (isDivergent(Reg) ||
isTemporalDivergent(*Phi.getParent(), Reg, *BB.getMBB()))
return true;
-
}
return false;
@@ -2259,7 +2261,8 @@ bool DivergenceAnalysis::inRegion(const MachineBasicBlock &BB) const {
// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
-void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader) {
+void DivergenceAnalysis::taintLoopLiveOuts(
+ const MachineBasicBlock &LoopHeader) {
auto *DivLoop = LI.getLoopFor(&LoopHeader);
assert(DivLoop && "loopHeader is not actually part of a loop");
@@ -2324,7 +2327,7 @@ void DivergenceAnalysis::taintLoopLiveOuts(const MachineBasicBlock &LoopHeader)
}
}
-void DivergenceAnalysis::pushInstruction(const MachineInstr &I) {
+void DivergenceAnalysis::pushInstruction(const MachineInstr &I) {
Worklist.push_back(&I);
}
void DivergenceAnalysis::pushPHINodes(const MachineBasicBlock &Block) {
@@ -2355,8 +2358,8 @@ void DivergenceAnalysis::pushUsers(const MachineInstr &I) {
}
}
-bool DivergenceAnalysis::propagateJoinDivergence(const MachineBasicBlock &JoinBlock,
- const MachineLoop *BranchLoop) {
+bool DivergenceAnalysis::propagateJoinDivergence(
+ const MachineBasicBlock &JoinBlock, const MachineLoop *BranchLoop) {
LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
// ignore divergence outside the region
@@ -2403,8 +2406,10 @@ void DivergenceAnalysis::propagateBranchDivergence(const MachineInstr &Term) {
}
}
-void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop) {
- LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber() << "\n");
+void DivergenceAnalysis::propagateLoopDivergence(
+ const MachineLoop &ExitingLoop) {
+ LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getHeader()->getNumber()
+ << "\n");
// don't propagate beyond region
if (!inRegion(*ExitingLoop.getHeader()))
@@ -2444,20 +2449,21 @@ void DivergenceAnalysis::propagateLoopDivergence(const MachineLoop &ExitingLoop)
// For case like
// %149:sreg_64_xexec = S_MOV_B64 $exec
//
-//bb.3:
+// bb.3:
//; predecessors: %bb.3, %bb.2
-// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%), %bb.4(50.00%)
+// successors: %bb.3(0x40000000), %bb.4(0x40000000); %bb.3(50.00%),
+// %bb.4(50.00%)
//
// %148:vreg_512 = PHI %56:vreg_512, %bb.2, %55:vreg_512, %bb.3
// %153:sgpr_32 = V_READFIRSTLANE_B32 %36:vgpr_32, implicit $exec
// %154:sreg_64 = V_CMP_EQ_U32_e64 %153:sgpr_32, %36:vgpr_32, implicit $exec
-// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec
-// $m0 = S_MOV_B32 %153:sgpr_32
-// %55:vreg_512 = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit $exec
-// $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc
+// %152:sreg_64 = S_AND_SAVEEXEC_B64 %154:sreg_64, implicit-def $exec,
+// implicit-def $scc, implicit $exec $m0 = S_MOV_B32 %153:sgpr_32 %55:vreg_512
+// = V_MOVRELD_B32_V16 %148:vreg_512(tied-def 0), -2, 0, implicit $m0, implicit
+// $exec $exec = S_XOR_B64_term $exec, %152:sreg_64, implicit-def $scc
// S_CBRANCH_EXECNZ %bb.3, implicit $exec
//
-//bb.4:
+// bb.4:
//; predecessors: %bb.3
// successors: %bb.5(0x80000000); %bb.5(100.00%)
//
@@ -2596,7 +2602,7 @@ void DivergenceAnalysis::compute() {
// propagate divergence
while (!Worklist.empty()) {
- const MachineInstr *I= Worklist.back();
+ const MachineInstr *I = Worklist.back();
Worklist.pop_back();
// maintain uniformity of overrides
@@ -2715,23 +2721,23 @@ bool DivergenceAnalysis::isDivergent(const MachineInstr &I) const {
void DivergenceAnalysis::print(raw_ostream &OS, const Module_ *) const {
// iterate instructions using instructions() to ensure a deterministic order.
for (auto &MBB : F)
- for (auto &I : MBB) {
- if (isDivergent(I))
- OS << "DIVERGENT:" << I ;
- // AMDGPU changes begin
- else
- OS << "UNIFORM:" << I ;
- // AMDGPU changes end
- }
+ for (auto &I : MBB) {
+ if (isDivergent(I))
+ OS << "DIVERGENT:" << I;
+ // AMDGPU changes begin
+ else
+ OS << "UNIFORM:" << I;
+ // AMDGPU changes end
+ }
}
// class GPUDivergenceAnalysis
-MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(MachineFunction &F,
- const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT,
- const MachineLoopInfo &LI)
- : SDA(DT, PDT, LI, /*AMDGPU change*/DivergentJoinMap),
- DA(F, nullptr, DT, PDT, LI, SDA, false, /*AMDGPU change*/DivergentJoinMap) {
+MirGPUDivergenceAnalysis::MirGPUDivergenceAnalysis(
+ MachineFunction &F, const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI)
+ : SDA(DT, PDT, LI, /*AMDGPU change*/ DivergentJoinMap),
+ DA(F, nullptr, DT, PDT, LI, SDA, false,
+ /*AMDGPU change*/ DivergentJoinMap) {
MachineRegisterInfo &MRI = F.getRegInfo();
const GCNSubtarget *ST = &F.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *SIRI = ST->getRegisterInfo();
@@ -2758,10 +2764,11 @@ bool MirGPUDivergenceAnalysis::isDivergent(const MachineInstr *I) const {
return DA.isDivergent(*I);
}
-void MirGPUDivergenceAnalysis::print(raw_ostream &OS, const Module_ *mod) const {
+void MirGPUDivergenceAnalysis::print(raw_ostream &OS,
+ const Module_ *mod) const {
OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
DA.print(OS, mod);
OS << "}\n";
}
-}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
index edcf96ec44..d9fd4044c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirDivergenceAnalysis.h
@@ -1,4 +1,5 @@
-//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++ -*-===//
+//===- AMDGPUMirDivergenceAnalysis.h - Mir Divergence Analysis -*- C++
+//-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,11 +15,11 @@
#pragma once
-#include "llvm/ADT/DenseSet.h"
+#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/Pass.h"
#include <vector>
@@ -50,8 +51,10 @@ public:
/// Otherwise the whole function is analyzed.
/// \param IsLCSSAForm whether the analysis may assume that the IR in the
/// region is in LCSSA form.
- DivergenceAnalysis(const llvm::MachineFunction &F, const MachineLoop *RegionLoop,
- const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ DivergenceAnalysis(const llvm::MachineFunction &F,
+ const MachineLoop *RegionLoop,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
const MachineLoopInfo &LI, SyncDependenceAnalysis &SDA,
bool IsLCSSAForm,
// AMDGPU change begin.
@@ -98,10 +101,12 @@ private:
bool updateTerminator(const MachineInstr &Term) const;
bool updatePHINode(const PHINode_ &Phi) const;
bool updateVCndMask(const MachineInstr &VCndMask) const;
- bool isBitUniform(const MachineInstr &I,
- llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
- bool isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO,
- llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
+ bool
+ isBitUniform(const MachineInstr &I,
+ llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
+ bool
+ isBitUniform(const MachineInstr &I, const llvm::MachineOperand &UseMO,
+ llvm::DenseMap<const MachineInstr *, bool> &Processed) const;
/// \brief Computes whether \p Inst is divergent based on the
/// divergence of its operands.
@@ -136,9 +141,9 @@ private:
}
/// \brief Whether \p Val is divergent when read in \p ObservingBlock.
- bool isTemporalDivergent(const MachineBasicBlock &ObservingBlock,
- const ValueTy Val,
- const MachineBasicBlock &incomingBlock) const; // AMDGPU change
+ bool isTemporalDivergent(
+ const MachineBasicBlock &ObservingBlock, const ValueTy Val,
+ const MachineBasicBlock &incomingBlock) const; // AMDGPU change
/// \brief Whether \p Block is join divergent
///
@@ -207,14 +212,14 @@ private:
// Set of known-uniform values.
llvm::DenseSet<unsigned> UniformOverrides;
- llvm::DenseSet<const llvm::MachineInstr*> UniformOverridesInsts;
+ llvm::DenseSet<const llvm::MachineInstr *> UniformOverridesInsts;
// Blocks with joining divergent control from different predecessors.
llvm::DenseSet<const MachineBasicBlock *> DivergentJoinBlocks;
// Detected/marked divergent values.
llvm::DenseSet<unsigned> DivergentValues;
- llvm::DenseSet<const llvm::MachineInstr*> DivergentInsts;
+ llvm::DenseSet<const llvm::MachineInstr *> DivergentInsts;
// Mir change for EXEC control flow.
// Map from MBB to the exec region it belongs to.
@@ -226,16 +231,15 @@ private:
struct ExecRegion {
const llvm::MachineInstr *begin;
const llvm::MachineInstr *end;
- std::vector<const llvm::MachineBasicBlock*> blocks;
+ std::vector<const llvm::MachineBasicBlock *> blocks;
bool bPropagated = false;
- ExecRegion(const llvm::MachineInstr *b,
- const llvm::MachineInstr *e)
+ ExecRegion(const llvm::MachineInstr *b, const llvm::MachineInstr *e)
: begin(b), end(e), bPropagated(false) {}
};
llvm::DenseMap<const llvm::MachineBasicBlock *, ExecRegion *> ExecRegionMap;
// Internal worklist for divergence propagation.
- std::vector<const llvm::MachineInstr*> Worklist;
+ std::vector<const llvm::MachineInstr *> Worklist;
};
/// \brief Divergence analysis frontend for GPU kernels.
@@ -251,15 +255,17 @@ class MirGPUDivergenceAnalysis {
// When A is divergent branch, B and C are divergent join at D.
// Then DivergentJoinMap[B].count(C) > 0 and
// DivergentJoinMap[C].count(B) > 0.
- DivergentJoinMapTy DivergentJoinMap;
+ DivergentJoinMapTy DivergentJoinMap;
// AMDGPU change end
SyncDependenceAnalysis SDA;
DivergenceAnalysis DA;
public:
/// Runs the divergence analysis on @F, a GPU kernel
- MirGPUDivergenceAnalysis(llvm::MachineFunction &F, const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI);
+ MirGPUDivergenceAnalysis(llvm::MachineFunction &F,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI);
/// Whether any divergence was detected.
bool hasDivergence() const { return DA.hasDetectedDivergence(); }
@@ -278,4 +284,3 @@ public:
};
} // namespace llvm
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
index 7213f7b4b1..302939c76a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.cpp
@@ -1,4 +1,5 @@
-//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence Calculation
+//===- MirSyncDependenceAnalysis.cpp - Mir Divergent Branch Dependence
+//Calculation
//--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -101,15 +102,15 @@
// loop exit and the loop header (_after_ SSA construction).
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "AMDGPUMirSyncDependenceAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include <stack>
#include <unordered_set>
@@ -120,19 +121,18 @@ namespace llvm {
ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet;
-SyncDependenceAnalysis::SyncDependenceAnalysis(const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT,
- const MachineLoopInfo &LI,
- // AMDGPU change begin.
- DivergentJoinMapTy &JoinMap
- // AMDGPU change end.
+SyncDependenceAnalysis::SyncDependenceAnalysis(
+ const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI,
+ // AMDGPU change begin.
+ DivergentJoinMapTy &JoinMap
+ // AMDGPU change end.
)
: FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI),
- // AMDGPU change begin.
+ // AMDGPU change begin.
DivergentJoinMap(JoinMap)
- // AMDGPU change end.
-{
-}
+// AMDGPU change end.
+{}
SyncDependenceAnalysis::~SyncDependenceAnalysis() {}
@@ -155,19 +155,23 @@ struct DivergencePropagator {
// if DefMap[B] ~ undef then we haven't seen B yet
// if DefMap[B] == B then B is a join point of disjoint paths from X or B is
// an immediate successor of X (initial value).
- using DefiningBlockMap = std::map<const MachineBasicBlock *, const MachineBasicBlock *>;
+ using DefiningBlockMap =
+ std::map<const MachineBasicBlock *, const MachineBasicBlock *>;
DefiningBlockMap DefMap;
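// Illustrative walk-through (editor's example, not part of the patch): for a
// diamond X -> {B, C}, B -> D, C -> D with a divergent branch in X, the
// propagator seeds DefMap[B] = B and DefMap[C] = C. Visiting D from B records
// DefMap[D] = B; visiting D from C then sees a conflicting definition, so
// DefMap[D] is set to D itself, marking D as a join point of disjoint paths.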
// all blocks with pending visits
std::unordered_set<const MachineBasicBlock *> PendingUpdates;
- DivergencePropagator(const FunctionRPOT &FuncRPOT, const MachineDominatorTree &DT,
- const MachinePostDominatorTree &PDT, const MachineLoopInfo &LI)
+ DivergencePropagator(const FunctionRPOT &FuncRPOT,
+ const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
+ const MachineLoopInfo &LI)
: FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI),
JoinBlocks(new ConstBlockSet) {}
// set the definition at @block and mark @block as pending for a visit
- void addPending(const MachineBasicBlock &Block, const MachineBasicBlock &DefBlock) {
+ void addPending(const MachineBasicBlock &Block,
+ const MachineBasicBlock &DefBlock) {
bool WasAdded = DefMap.emplace(&Block, &DefBlock).second;
if (WasAdded)
PendingUpdates.insert(&Block);
@@ -190,7 +194,8 @@ struct DivergencePropagator {
// process @succBlock with reaching definition @defBlock
// the original divergent branch was in @parentLoop (if any)
- void visitSuccessor(const MachineBasicBlock &SuccBlock, const MachineLoop *ParentLoop,
+ void visitSuccessor(const MachineBasicBlock &SuccBlock,
+ const MachineLoop *ParentLoop,
const MachineBasicBlock &DefBlock) {
// @succBlock is a loop exit
@@ -223,14 +228,14 @@ struct DivergencePropagator {
// divergent exits.
// @rootBlock is either the block containing the branch or the header of the
// divergent loop.
- // @nodeSuccessors is the set of successors of the node (MachineLoop or Terminator)
- // headed by @rootBlock.
- // @parentLoop is the parent loop of the MachineLoop or the loop that contains the
- // Terminator.
+ // @nodeSuccessors is the set of successors of the node (MachineLoop or
+ // Terminator) headed by @rootBlock.
+ // @parentLoop is the parent loop of the MachineLoop or the loop that contains
+ // the Terminator.
template <typename SuccessorIterable>
- std::unique_ptr<ConstBlockSet>
- computeJoinPoints(const MachineBasicBlock &RootBlock,
- SuccessorIterable NodeSuccessors, const MachineLoop *ParentLoop, const MachineBasicBlock * PdBoundBlock) {
+ std::unique_ptr<ConstBlockSet> computeJoinPoints(
+ const MachineBasicBlock &RootBlock, SuccessorIterable NodeSuccessors,
+ const MachineLoop *ParentLoop, const MachineBasicBlock *PdBoundBlock) {
assert(JoinBlocks);
// bootstrap with branch targets
@@ -250,7 +255,8 @@ struct DivergencePropagator {
auto ItBeginRPO = FuncRPOT.begin();
// skip until term (TODO RPOT won't let us start at @term directly)
- for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {}
+ for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {
+ }
auto ItEndRPO = FuncRPOT.end();
assert(ItBeginRPO != ItEndRPO);
@@ -337,30 +343,26 @@ struct DivergencePropagator {
// | B C
// | | / |
// +--L P
- //
+ //
// In this cfg, C is the RootBlock and P is C's post-dominator.
// It will only visit L and P and then stop because it hits the
// post dominator. Most loops do not hit this case because the
// loop exiting block (C) will branch directly back to the loop
// header.
- //
- if (HeaderDefBlock)
- {
- for (const auto *ExitBlock : ReachedLoopExits) {
- auto ItExitDef = DefMap.find(ExitBlock);
- assert((ItExitDef != DefMap.end()) &&
- "no reaching def at reachable loop exit");
- if (ItExitDef->second != HeaderDefBlock) {
- JoinBlocks->insert(ExitBlock);
- }
- }
- }
- else
- {
- for (const auto *ExitBlock : ReachedLoopExits)
- {
- JoinBlocks->insert(ExitBlock);
+ //
+ if (HeaderDefBlock) {
+ for (const auto *ExitBlock : ReachedLoopExits) {
+ auto ItExitDef = DefMap.find(ExitBlock);
+ assert((ItExitDef != DefMap.end()) &&
+ "no reaching def at reachable loop exit");
+ if (ItExitDef->second != HeaderDefBlock) {
+ JoinBlocks->insert(ExitBlock);
}
+ }
+ } else {
+ for (const auto *ExitBlock : ReachedLoopExits) {
+ JoinBlocks->insert(ExitBlock);
+ }
}
}
@@ -370,12 +372,14 @@ struct DivergencePropagator {
// AMDGPU change begin.
// For all join blocks caused by a divergent RootBlock, the preds of a join block
-// which are in DefMap or the RootBlock are divergent join each other on the join block because
-// of divergent RootBlock.
-static void updateJoinMap(
- const MachineBasicBlock *RootBlock,
- DenseMap<const MachineBasicBlock *, SmallPtrSet<const MachineBasicBlock *, 4>> &JoinMap,
- DivergencePropagator::DefiningBlockMap &DefMap, ConstBlockSet &JoinBlocks) {
+// which are in DefMap, or the RootBlock itself, divergently join each other on
+// the join block because of the divergent RootBlock.
+static void
+updateJoinMap(const MachineBasicBlock *RootBlock,
+ DenseMap<const MachineBasicBlock *,
+ SmallPtrSet<const MachineBasicBlock *, 4>> &JoinMap,
+ DivergencePropagator::DefiningBlockMap &DefMap,
+ ConstBlockSet &JoinBlocks) {
for (const MachineBasicBlock *JoinBB : JoinBlocks) {
// Mark divergent join for all pred pairs which are in DefMap.
for (auto predIt = JoinBB->pred_begin(); predIt != JoinBB->pred_end();
@@ -400,7 +404,8 @@ static void updateJoinMap(
}
// AMDGPU change end.
-const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) {
+const ConstBlockSet &
+SyncDependenceAnalysis::join_blocks(const MachineLoop &MachineLoop) {
using LoopExitVec = SmallVector<MachineBasicBlock *, 4>;
LoopExitVec LoopExits;
MachineLoop.getExitBlocks(LoopExits);
@@ -415,7 +420,8 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &Mach
}
// don't propagate beyond the immediate post dominator of the loop
- const auto *PdNode = PDT.getNode(const_cast<MachineBasicBlock *>(MachineLoop.getHeader()));
+ const auto *PdNode =
+ PDT.getNode(const_cast<MachineBasicBlock *>(MachineLoop.getHeader()));
const auto *IpdNode = PdNode->getIDom();
const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
while (PdBoundBlock && MachineLoop.contains(PdBoundBlock)) {
@@ -426,15 +432,17 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const MachineLoop &Mach
// compute all join points
DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
- *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(), PdBoundBlock);
+ *MachineLoop.getHeader(), LoopExits, MachineLoop.getParentLoop(),
+ PdBoundBlock);
// AMDGPU change begin.
// Save divergent join pairs.
updateJoinMap(MachineLoop.getHeader(), DivergentJoinMap, Propagator.DefMap,
- *JoinBlocks.get());
+ *JoinBlocks.get());
// AMDGPU change end.
- auto ItInserted = CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks));
+ auto ItInserted =
+ CachedLoopExitJoins.emplace(&MachineLoop, std::move(JoinBlocks));
assert(ItInserted.second);
return *ItInserted.first->second;
}
@@ -452,18 +460,18 @@ SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) {
return *ItCached->second;
  // don't propagate beyond the immediate post dominator of the branch
- const auto *PdNode = PDT.getNode(const_cast<MachineBasicBlock *>(Term.getParent()));
+ const auto *PdNode =
+ PDT.getNode(const_cast<MachineBasicBlock *>(Term.getParent()));
const auto *IpdNode = PdNode->getIDom();
const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
-
// compute all join points
DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
const auto &TermBlock = *Term.getParent();
-
+
// AMDGPU CHANGE
// Make sure the post-dominator is outside the loop for the loop header.
- // Otherwise, we may not find all the join blocks in the loop
+ // Otherwise, we may not find all the join blocks in the loop
// because the search stops too early. Some join points can be reached
// after the post-dominator!
//
@@ -477,30 +485,30 @@ SyncDependenceAnalysis::join_blocks(const MachineInstr &Term) {
//
// In this cfg, A is the loop header and P is A's post-dominator.
  // The algorithm to mark join points does a Reverse Post Order walk
- // from A and stops when it reaches the post dominator. It would not
+ // from A and stops when it reaches the post dominator. It would not
// mark the phi node in L as divergent even when A had a divergent branch.
// The fix we made was to make the join point search continue all the way
// to the loops post dominator (which is X in this example).
//
// NOTE: They already made this change for the loop case above, but for
- // a different bug apparently. See SyncDependenceAnalysis::join_blocks(MachineLoop&)
- //
+ // a different bug apparently. See
+ // SyncDependenceAnalysis::join_blocks(MachineLoop&)
+ //
const MachineLoop *MachineLoop = LI.getLoopFor(&TermBlock);
- if (MachineLoop && (MachineLoop->getHeader() == &TermBlock))
- {
- while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) {
- IpdNode = IpdNode->getIDom();
- PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
- }
+ if (MachineLoop && (MachineLoop->getHeader() == &TermBlock)) {
+ while (PdBoundBlock && MachineLoop->contains(PdBoundBlock)) {
+ IpdNode = IpdNode->getIDom();
+ PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+ }
}
-
+
auto JoinBlocks = Propagator.computeJoinPoints(
TermBlock, Term.getParent()->successors(), MachineLoop, PdBoundBlock);
// AMDGPU change begin.
// Save divergent join pairs.
updateJoinMap(&TermBlock, DivergentJoinMap, Propagator.DefMap,
- *JoinBlocks.get());
+ *JoinBlocks.get());
// AMDGPU change end.
auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
index a52bcc7bc9..321fcf5e6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMirSyncDependenceAnalysis.h
@@ -1,4 +1,5 @@
-//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++ -*-===//
+//===- MirSyncDependenceAnalysis.h - MirDivergent Branch Dependence -*- C++
+//-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -20,8 +21,8 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include <memory>
#include <map>
+#include <memory>
namespace llvm {
class MachineBasicBlock;
@@ -44,14 +45,16 @@ using ConstBlockSet = llvm::SmallPtrSet<const MachineBasicBlock *, 4>;
/// This analysis relates points of divergent control to points of converging
/// divergent control. The analysis requires all loops to be reducible.
class SyncDependenceAnalysis {
- void visitSuccessor(const MachineBasicBlock &succBlock, const MachineLoop *termLoop,
+ void visitSuccessor(const MachineBasicBlock &succBlock,
+ const MachineLoop *termLoop,
const MachineBasicBlock *defBlock);
public:
bool inRegion(const MachineBasicBlock &BB) const;
~SyncDependenceAnalysis();
- SyncDependenceAnalysis(const MachineDominatorTree &DT, const MachinePostDominatorTree &PDT,
+ SyncDependenceAnalysis(const MachineDominatorTree &DT,
+ const MachinePostDominatorTree &PDT,
const MachineLoopInfo &LI,
// AMDGPU change begin
DivergentJoinMapTy &JoinMap
@@ -88,11 +91,10 @@ private:
// AMDGPU change begin.
DivergentJoinMapTy &DivergentJoinMap;
// AMDGPU change end.
- std::map<const MachineLoop *, std::unique_ptr<ConstBlockSet>> CachedLoopExitJoins;
+ std::map<const MachineLoop *, std::unique_ptr<ConstBlockSet>>
+ CachedLoopExitJoins;
std::map<const MachineInstr *, std::unique_ptr<ConstBlockSet>>
CachedBranchJoins;
};
} // namespace llvm
-
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
index 648df7f724..49a8e4f076 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -1,4 +1,5 @@
-//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===//
+//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and
+//latency --===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,11 +13,11 @@
//
//===--------------------------------------------------------------------------------===//
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
+#include "AMDGPUOccupancyAndLatencyHelper.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
-#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
@@ -57,7 +58,7 @@ bool SchedScore::isBetter(const SchedScore &s) const {
bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
unsigned gain = latencyGain(TargetOccupancy, ExtraOcc);
// 10% is good enough.
- if ((10*gain) >= Alu)
+ if ((10 * gain) >= Alu)
return true;
else
return false;
@@ -65,7 +66,7 @@ bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const {
unsigned latency = MemLatency;
- return (latency / (TgtOcc))- (latency / (TgtOcc + ExtraOcc));
+ return (latency / (TgtOcc)) - (latency / (TgtOcc + ExtraOcc));
}
// AMDGPULatencyTracker
@@ -73,7 +74,8 @@ AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST)
: SIII(ST.getInstrInfo()), ItinerayData(ST.getInstrItineraryData()) {}
void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
- if (MI.isDebugInstr()) return;
+ if (MI.isDebugInstr())
+ return;
int latency = SIII->getInstrLatency(ItinerayData, MI);
  // If we are inside a latency-hiding window.
if (!LatencyMIs.empty()) {
@@ -184,5 +186,3 @@ SchedScore CollectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST,
}
} // namespace llvm
-
-
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
index f108bab24b..7444f63845 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h
@@ -1,4 +1,5 @@
-//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and latency --===//
+//===-- AMDGPUOccupancyAndLatencyHelper - Helper functions for occupancy and
+//latency --===//
//
// The LLVM Compiler Infrastructure
//
@@ -30,7 +31,7 @@ struct SchedScore {
unsigned MemLatency = 0; // Only save mem latency.
  // We want memory latency to be small and the hidden latency to be big. Compare
  // memLatency - hide * Occ; smaller is better.
-  unsigned MixAlu = 0; // VAlu and SAlu can run in parallel if Occ > 1.
+  unsigned MixAlu = 0; // VAlu and SAlu can run in parallel if Occ > 1.
  unsigned Alu = 0; // avoid sequence of s_alu inst count less than occupancy.
unsigned Lds = 0; // Todo: count lds.
SchedScore() {}
@@ -39,9 +40,9 @@ struct SchedScore {
float computeScore() const;
float computeScore2() const;
- void sum(const SchedScore &s, unsigned loopDepth=0);
+ void sum(const SchedScore &s, unsigned loopDepth = 0);
bool isBetter(const SchedScore &s) const;
- bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc=1) const;
+ bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
  // More latency can be hidden with ExtraOcc.
unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
};
@@ -71,4 +72,4 @@ struct AMDGPULatencyTracker {
SchedScore CollectLatency(llvm::MachineFunction &MF,
const llvm::GCNSubtarget &ST,
const llvm::MachineLoopInfo *MLI = nullptr);
-}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
index a0f2a5d4dc..6f2200d8f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.cpp
@@ -1,9 +1,9 @@
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/SlotIndexes.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/SlotIndexes.h"
-//#include "dxc/DXIL/DxilMetadataHelper.h"
+// #include "dxc/DXIL/DxilMetadataHelper.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
@@ -14,9 +14,9 @@
#include "llvm/Support/Debug.h"
-#include "GCNRegPressure.h"
#include "AMDGPUMIRUtils.h"
#include "AMDGPUSubExpDag.h"
+#include "GCNRegPressure.h"
#include <unordered_set>
#define DEBUG_TYPE "xb-sub-exp-dag"
@@ -27,37 +27,35 @@ namespace llvm {
// Expression Dag.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SubExp::dump(const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI) const {
- dbgs() << "\nSubExp:\n";
- dbgs() << "input regs:\n";
- for (auto &input : inputLive) {
- pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs());
- dbgs() << "\n";
- }
- dbgs() << "output regs:\n";
- for (auto &output : outputLive) {
- pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs());
- dbgs() << "\n";
- }
+void SubExp::dump(const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *SIRI) const {
+ dbgs() << "\nSubExp:\n";
+ dbgs() << "input regs:\n";
+ for (auto &input : inputLive) {
+ pressure::print_reg(input.first, MRI, SIRI, llvm::dbgs());
+ dbgs() << "\n";
+ }
+ dbgs() << "output regs:\n";
+ for (auto &output : outputLive) {
+ pressure::print_reg(output.first, MRI, SIRI, llvm::dbgs());
+ dbgs() << "\n";
+ }
- for (MachineInstr *MI : SUnits) {
- MI->dump();
- }
- dbgs() << "End of SubExp\n";
+ for (MachineInstr *MI : SUnits) {
+ MI->dump();
+ }
+ dbgs() << "End of SubExp\n";
}
#endif
-bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo* SIRI) const
-{
- for (const MachineInstr *MI : SUnits)
- {
- if (MI->modifiesRegister(Reg, SIRI))
- {
- return true;
- }
+bool SubExp::modifiesRegister(unsigned Reg, const SIRegisterInfo *SIRI) const {
+ for (const MachineInstr *MI : SUnits) {
+ if (MI->modifiesRegister(Reg, SIRI)) {
+ return true;
}
+ }
- return false;
+ return false;
}
void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI,
@@ -95,7 +93,9 @@ void SubExp::calcMaxPressure(const MachineRegisterInfo &MRI,
for (auto it = SUnits.rbegin(); it != SUnits.rend(); it++) {
MachineInstr *MI = *it;
- auto *ST = &MI->getMF()->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+ auto *ST =
+ &MI->getMF()
+ ->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
for (MachineOperand &MO : MI->operands()) {
if (!MO.isReg())
continue;
@@ -149,8 +149,8 @@ bool SubExp::isSafeToMove(const MachineRegisterInfo &MRI, bool bMoveUp) const {
}
ExpDag::ExpDag(const llvm::MachineRegisterInfo &MRI,
- const llvm::SIRegisterInfo *SIRI,
- const SIInstrInfo *SIII, const bool bJoinInput)
+ const llvm::SIRegisterInfo *SIRI, const SIInstrInfo *SIII,
+ const bool bJoinInput)
: MRI(MRI), SIRI(SIRI), SIII(SIII), bJoinInputToSubExp(bJoinInput) {}
template <typename T>
@@ -196,9 +196,9 @@ template void
ExpDag::build<DenseSet<MachineInstr *>>(const LiveSet &InputLiveReg,
const LiveSet &OutputLiveReg,
DenseSet<MachineInstr *> &instRange);
-template void ExpDag::build<std::vector<MachineInstr *>>(const LiveSet &InputLiveReg,
- const LiveSet &OutputLiveReg,
- std::vector<MachineInstr *> &instRange);
+template void ExpDag::build<std::vector<MachineInstr *>>(
+ const LiveSet &InputLiveReg, const LiveSet &OutputLiveReg,
+ std::vector<MachineInstr *> &instRange);
void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
const SIRegisterInfo *SIRI, const SIInstrInfo *SIII) {
@@ -311,7 +311,8 @@ void ExpDag::buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
// UserMI should always be in same subExp.
unsigned UseSubIdx = SubtreeClasses[UseSU->NodeNum];
if (UseSubIdx != OriginSubIdx) {
-          // When a reg has multiple defs, it is possible for the user and def to be in different subExps.
+          // When a reg has multiple defs, it is possible for the user and def
+          // to be in different subExps.
if (MRI.getUniqueVRegDef(Reg))
llvm::report_fatal_error("user and def in different subExp");
break;
@@ -470,9 +471,8 @@ void BlockExpDag::buildWithPressure() {
buildPressure(StartLiveReg, EndLiveReg);
}
-void BlockExpDag::buildAvail(
- const LiveSet &passThruSet,
- DenseMap<SUnit *, LiveSet> &DagAvailRegMap) {
+void BlockExpDag::buildAvail(const LiveSet &passThruSet,
+ DenseMap<SUnit *, LiveSet> &DagAvailRegMap) {
DenseSet<SUnit *> Processed;
DenseSet<SUnit *> WorkList;
@@ -596,10 +596,10 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg,
// Using pass thru as base because output of current SU should not
// affect other output SUs.
GCNUpwardRPTracker RP(*LIS);
- RP.reset(BeginMI, &passThruSet, /*After*/true);
+ RP.reset(BeginMI, &passThruSet, /*After*/ true);
MachineInstr *MI = SU.getInstr();
if (MI) {
- RP.reset(*MI, &passThruSet, /*After*/true);
+ RP.reset(*MI, &passThruSet, /*After*/ true);
RP.recede(*MI);
}
DagPressureMap[&SU] = RP.getLiveRegs();
@@ -639,9 +639,9 @@ void BlockExpDag::buildPressure(const LiveSet &StartLiveReg,
GCNRPTracker::LiveRegSet SuccLive = DagPressureMap[SuccSU];
GCNUpwardRPTracker RP(*LIS);
- RP.reset(BeginMI, &SuccLive, /*After*/true);
+ RP.reset(BeginMI, &SuccLive, /*After*/ true);
if (MI) {
- RP.reset(*MI, &SuccLive, /*After*/true);
+ RP.reset(*MI, &SuccLive, /*After*/ true);
// Update SuccLive based on MI.
RP.recede(*MI);
}
@@ -684,9 +684,7 @@ std::string ExpDag::getGraphNodeLabel(const SUnit *SU) const {
}
/// Return the label.
-std::string ExpDag::getDAGName() const {
- return "dag.exp";
-}
+std::string ExpDag::getDAGName() const { return "dag.exp"; }
/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
/// rendered using 'dot'.
@@ -707,7 +705,7 @@ void ExpDag::dump() {
viewGraph(getDAGName(), "Exp Dag Graph for " + getDAGName());
}
-}
+} // namespace llvm
// Expression Dag dump.
namespace llvm {
@@ -757,7 +755,8 @@ struct DOTGraphTraits<llvm::ExpDag *> : public DefaultDOTGraphTraits {
SS << "SU:" << SU->NodeNum;
return SS.str();
}
- static std::string getNodeDescription(const SUnit *SU, const llvm::ExpDag *G) {
+ static std::string getNodeDescription(const SUnit *SU,
+ const llvm::ExpDag *G) {
return G->getGraphNodeLabel(SU);
}
static std::string getNodeAttributes(const SUnit *N,
@@ -804,7 +803,9 @@ void getRegBound(llvm::MachineBasicBlock *MBB,
const GCNRPTracker::LiveRegSet outputLive =
llvm::getLiveRegs(EndSlot, *LIS, MRI);
- auto* ST = &MBB->getParent()->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
+ auto *ST =
+ &MBB->getParent()
+ ->getSubtarget<GCNSubtarget>(); // TODO: Better way to get this.
if (MBB->empty()) {
GCNRegPressure MaxPressure = getRegPressure(MRI, outputLive);
MaxSGPR = MaxPressure.getSGPRNum();
@@ -845,7 +846,7 @@ void getRegBound(llvm::MachineBasicBlock *MBB,
auto SchedResult = hrbSched(SUnits, BotRoots, MRI, SIRI);
GCNUpwardRPTracker RPTracker(*LIS);
- RPTracker.reset(MBB->front(), &outputLive, /*After*/true);
+ RPTracker.reset(MBB->front(), &outputLive, /*After*/ true);
for (auto it = SchedResult.rbegin(); it != SchedResult.rend(); it++) {
const SUnit *SU = *it;
if (!SU->isInstr())
@@ -1038,8 +1039,7 @@ void HRB::buildLinear(std::vector<llvm::SUnit> &SUnits) {
}
LLVM_DEBUG(
- dbgs() << "Chained Nodes:"; for (SUnit *SU
- : ChainedNodes) {
+ dbgs() << "Chained Nodes:"; for (SUnit *SU : ChainedNodes) {
dbgs() << " " << SU->NodeNum << "\n";
} for (int i = 0; i < Lineages.size(); i++) {
dbgs() << "Lineage" << i << ":";
@@ -1116,8 +1116,7 @@ SUnit *HRB::findHeir(SUnit *SU, std::vector<llvm::SUnit> &SUnits) {
return Heir;
}
-HRB::Lineage HRB::buildChain(SUnit *Node,
- std::vector<llvm::SUnit> &SUnits) {
+HRB::Lineage HRB::buildChain(SUnit *Node, std::vector<llvm::SUnit> &SUnits) {
HRB::Lineage chain;
chain.addNode(Node);
ChainedNodes.insert(Node);
@@ -1241,8 +1240,7 @@ void HRB::buildReachRelation(ArrayRef<SUnit *> BotRoots) {
}
ReachMap.erase(&FakeEntry);
- LLVM_DEBUG(for (Lineage &L
- : Lineages) {
+ LLVM_DEBUG(for (Lineage &L : Lineages) {
for (SUnit *SU : L.Nodes) {
DenseSet<SUnit *> &CurReach = ReachMap[SU];
dbgs() << SU->NodeNum << " reach: ";
@@ -1703,8 +1701,7 @@ std::vector<const SUnit *> hrbSched(std::vector<SUnit> &SUnits,
return confA > confB;
});
- LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU
- : ReadyList) {
+ LLVM_DEBUG(dbgs() << "ReadyList:\n"; for (SUnit *SU : ReadyList) {
dbgs() << " " << SU->NodeNum;
} dbgs() << "\n";);
SUnit *Candidate = nullptr;
@@ -1754,7 +1751,7 @@ std::vector<const SUnit *> hrbSched(std::vector<SUnit> &SUnits,
SUnit *SU = *it;
if (!Color.isHead(SU)) {
- continue;
+ continue;
}
Candidate = SU;
// Remove Candidate from ReadyList.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
index c234f32370..a7d29430b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubExpDag.h
@@ -4,7 +4,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/MC/LaneBitmask.h"
-#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
namespace llvm {
class MachineFunction;
@@ -14,8 +14,7 @@ class SIRegisterInfo;
class SIInstrInfo;
class MachineInstr;
class MachineBasicBlock;
-template<typename GraphType>
-class GraphWriter;
+template <typename GraphType> class GraphWriter;
class SUnit;
class IntEqClasses;
class Twine;
@@ -55,13 +54,12 @@ struct SubExp {
const llvm::SIRegisterInfo *SIRI);
void dump(const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI) const;
- bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo* SIRI) const;
+ bool modifiesRegister(unsigned Reg, const llvm::SIRegisterInfo *SIRI) const;
};
struct ExpDag {
ExpDag(const llvm::MachineRegisterInfo &MRI, const llvm::SIRegisterInfo *SIRI,
- const llvm::SIInstrInfo *SIII,
- const bool bJoinInput);
+ const llvm::SIInstrInfo *SIII, const bool bJoinInput);
const llvm::MachineRegisterInfo &MRI;
const llvm::SIRegisterInfo *SIRI;
const llvm::SIInstrInfo *SIII;
@@ -83,13 +81,14 @@ struct ExpDag {
std::string getDAGName() const;
/// Adds custom features for a visualization of the ScheduleDAG.
void addCustomGraphFeatures(llvm::GraphWriter<ExpDag *> &) const {}
+
private:
- template<typename T>
- void initNodes(const LiveSet &InputLiveReg, T &insts);
+ template <typename T> void initNodes(const LiveSet &InputLiveReg, T &insts);
void addDataDep(const llvm::SIRegisterInfo *SIRI);
void addCtrlDep();
void buildSubExp(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg,
- const llvm::SIRegisterInfo *SIRI, const llvm::SIInstrInfo *SIII);
+ const llvm::SIRegisterInfo *SIRI,
+ const llvm::SIInstrInfo *SIII);
};
struct BlockExpDag : public ExpDag {
@@ -103,11 +102,11 @@ struct BlockExpDag : public ExpDag {
std::vector<SubExp> SubExps;
void build();
void buildWithPressure();
+
private:
void buildAvail(const LiveSet &passThruSet,
llvm::DenseMap<llvm::SUnit *, LiveSet> &DagAvailRegMap);
- void buildPressure(const LiveSet &StartLiveReg,
- const LiveSet &EndLiveReg);
+ void buildPressure(const LiveSet &StartLiveReg, const LiveSet &EndLiveReg);
};
void getRegBound(llvm::MachineBasicBlock *MBB,
@@ -194,4 +193,4 @@ std::vector<const llvm::SUnit *> hrbSched(std::vector<llvm::SUnit> &SUnits,
const llvm::MachineRegisterInfo &MRI,
const llvm::SIRegisterInfo *SIRI);
-}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
index c9172bae2c..09f1d8dfa4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVMemDegreeDAG.h
@@ -1,4 +1,5 @@
-//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG --------------===//
+//===-- AMDGPUVMemDegreeDAG.h - Build degree about VMem on DAG
+//--------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,9 +15,9 @@
//===----------------------------------------------------------------------===//
#pragma once
-#include <vector>
#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+#include "llvm/CodeGen/ScheduleDAG.h" // For SUnit.
+#include <vector>
namespace llvm {
class MachineBasicBlock;
@@ -42,7 +43,6 @@ private:
void addCtrlDep();
};
-
// Collect height/depth for high-latency memory loads; height/depth only updates
// when crossing a high-latency memory load. Call the height/depth the VMem degree here.
// The rule is that a sample and its user should have different degrees.
@@ -60,15 +60,13 @@ private:
class VMemDegreeDAG {
public:
- VMemDegreeDAG(std::vector<llvm::SUnit> &Units,
- const llvm::SIInstrInfo *TII)
+ VMemDegreeDAG(std::vector<llvm::SUnit> &Units, const llvm::SIInstrInfo *TII)
: SUnits(Units), SIII(TII) {}
std::vector<llvm::SUnit> &SUnits;
// InstrInfo.
const llvm::SIInstrInfo *SIII;
void build();
-
bool isHighLatency(const llvm::SUnit *SU) const;
bool isHighLatency(const llvm::MachineInstr *MI) const;
// height/depth based on Long latency inst.
@@ -79,28 +77,24 @@ public:
std::vector<unsigned> VMemFullDepth;
llvm::SmallVector<llvm::SUnit *, 16> VMemSUs;
llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16> GroupedVMemSUs;
- llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16> GroupedVMemSUsByDepth;
-
+ llvm::SmallVector<llvm::SmallVector<llvm::SUnit *, 8>, 16>
+ GroupedVMemSUsByDepth;
void dump();
private:
static constexpr unsigned kNoReg = -1;
-
- std::pair<unsigned, unsigned> buildVMemDepthHeight(std::vector<unsigned> &VMemHeight,
- std::vector<unsigned> &VMemDepth, bool bDataOnly);
+ std::pair<unsigned, unsigned>
+ buildVMemDepthHeight(std::vector<unsigned> &VMemHeight,
+ std::vector<unsigned> &VMemDepth, bool bDataOnly);
// Compute vmem height/depth.
void buildVMemDepthHeight();
void buildVMemDataDepthHeight();
void groupVmemSUnits();
-
};
-
-
// Split block based on vmem depth.
void buildVMemDepth(llvm::MachineBasicBlock &MBB, llvm::VMemDegreeDAG &dag);
-}
-
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index cb10df2c34..8debda9032 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1313,7 +1313,7 @@ public:
bool isLowLatencyInstruction(const MachineInstr &MI) const;
bool isHighLatencyDef(int Opc) const override;
- bool isHighLatencyInstruction(const MachineInstr& MI) const {
+ bool isHighLatencyInstruction(const MachineInstr &MI) const {
return isHighLatencyDef(MI.getOpcode());
}
``````````
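
The comments in the sync-dependence hunks above describe the join-point search this way: every successor of a divergent branch propagates its own defining-block label through the CFG in reverse post order, a block that receives two different labels is a divergent join, and the walk is bounded by the branch block's immediate post-dominator. Below is a minimal standalone sketch of that labeling scheme, assuming a toy string-keyed CFG; the block names are made up, and the backedge re-propagation the real analysis performs is omitted.

``````````cpp
// Sketch of the join-point search: successors of a divergent branch each
// propagate their own label in reverse post order; a block that receives
// two different labels is a divergent join; the walk stops at the branch
// block's immediate post-dominator ("Bound").
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using CFG = std::map<std::string, std::vector<std::string>>;

std::set<std::string> computeJoins(const CFG &Succs,
                                   const std::vector<std::string> &RPO,
                                   const std::string &Branch,
                                   const std::string &Bound) {
  std::map<std::string, std::string> DefMap; // block -> reaching-def label
  for (const std::string &S : Succs.at(Branch))
    DefMap.emplace(S, S); // each successor starts as its own reaching def
  std::set<std::string> Joins;
  for (const std::string &B : RPO) {
    if (B == Bound || !DefMap.count(B))
      continue; // stop at the post-dominance bound; skip unreached blocks
    const std::string Label = DefMap.at(B);
    for (const std::string &S : Succs.at(B)) {
      auto It = DefMap.find(S);
      if (It == DefMap.end())
        DefMap.emplace(S, Label); // first reaching def wins
      else if (It->second != Label)
        Joins.insert(S); // two different reaching defs meet: divergent join
    }
  }
  return Joins;
}

int main() {
  // Toy CFG shaped like the one in the comments: A is a loop header with a
  // divergent branch to B and L; P is reached from the loop-exiting block L.
  CFG Succs = {{"A", {"B", "L"}}, {"B", {"L"}}, {"L", {"A", "P"}}, {"P", {}}};
  std::vector<std::string> RPO = {"A", "B", "L", "P"};
  for (const std::string &J : computeJoins(Succs, RPO, "A", "P"))
    std::cout << "join: " << J << "\n"; // prints: join: L
  return 0;
}
``````````

In the toy CFG, L is reported as a join because it is reachable from both successors of A. The change in `join_blocks(MachineInstr &)` above widens the post-dominance bound past the loop whenever the branch block is a loop header, so joins that sit beyond an in-loop post-dominator are not missed.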
</details>
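
On the occupancy side, the `SchedScore` hunks encode a simple memory-boundedness test: `latencyGain` estimates the cycles an extra wave would hide as `latency/TgtOcc - latency/(TgtOcc + ExtraOcc)`, and `isMemBound` reports memory bound when ten times that gain reaches the ALU cycle count (the "10% is good enough" check). Here is a self-contained sketch with made-up numbers; the 4000-cycle latency, 900 ALU cycles, and occupancy of 4 are hypothetical, not from the patch.

``````````cpp
#include <cstdio>

// Mirrors the formula in SchedScore::latencyGain from the diff: cycles of
// memory latency hidden by raising occupancy from TgtOcc to TgtOcc + ExtraOcc,
// assuming the latency is split across the resident waves.
unsigned latencyGain(unsigned MemLatency, unsigned TgtOcc, unsigned ExtraOcc) {
  return (MemLatency / TgtOcc) - (MemLatency / (TgtOcc + ExtraOcc));
}

// Mirrors SchedScore::isMemBound: memory bound when the gain from extra
// occupancy is at least 10% of the ALU cycle count (10 * gain >= Alu).
bool isMemBound(unsigned MemLatency, unsigned Alu, unsigned TgtOcc,
                unsigned ExtraOcc = 1) {
  return 10 * latencyGain(MemLatency, TgtOcc, ExtraOcc) >= Alu;
}

int main() {
  // Hypothetical block: 4000 cycles of memory latency, 900 ALU cycles,
  // current target occupancy of 4 waves.
  unsigned Gain = latencyGain(4000, 4, 1); // 4000/4 - 4000/5 = 200 cycles
  std::printf("gain=%u memBound=%d\n", Gain, isMemBound(4000, 900, 4));
  return 0;
}
``````````

With those numbers the extra wave hides 200 cycles, and 10 * 200 >= 900, so the block would count as memory bound under this heuristic.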
https://github.com/llvm/llvm-project/pull/126331