[llvm] r186724 - R600: Simplify AMDILCFGStructurize by removing templates and assuming single exit

Ye, Mei Mei.Ye at amd.com
Fri Jul 19 16:34:44 PDT 2013


Hi Evan and Nick

Vincent's changes reminded me that this is probably where Evan and Nick would have preferred my changes to go.
But isn't there something in AMDGPUCFG* that could be shared with other GPU targets?  I am not worried about business competitors.  We have multiple GPU targets inside the same company, and it doesn't make sense for every target to repeat similar work.  I think the community should promote code sharing even when the invocation of the code is not shared.

-Mei


-----Original Message-----
From: llvm-commits-bounces at cs.uiuc.edu [mailto:llvm-commits-bounces at cs.uiuc.edu] On Behalf Of Vincent Lejeune
Sent: Friday, July 19, 2013 2:45 PM
To: llvm-commits at cs.uiuc.edu
Subject: [llvm] r186724 - R600: Simplify AMDILCFGStructurize by removing templates and assuming single exit

Author: vljn
Date: Fri Jul 19 16:45:06 2013
New Revision: 186724

URL: http://llvm.org/viewvc/llvm-project?rev=186724&view=rev
Log:
R600: Simplify AMDILCFGStructurize by removing templates and assuming single exit

Modified:
    llvm/trunk/lib/Target/R600/AMDGPU.h
    llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp
    llvm/trunk/lib/Target/R600/AMDILCFGStructurizer.cpp

Modified: llvm/trunk/lib/Target/R600/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPU.h?rev=186724&r1=186723&r2=186724&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/R600/AMDGPU.h Fri Jul 19 16:45:06 2013
@@ -31,7 +31,6 @@ FunctionPass *createR600ExpandSpecialIns
 FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
 FunctionPass *createR600Packetizer(TargetMachine &tm);
 FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
-FunctionPass *createAMDGPUCFGPreparationPass(TargetMachine &tm);
 FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm);
 
 // SI Passes

Modified: llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp?rev=186724&r1=186723&r2=186724&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp Fri Jul 19 16:45:06 2013
@@ -160,7 +160,6 @@ bool AMDGPUPassConfig::addPreSched2() {
 bool AMDGPUPassConfig::addPreEmitPass() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
   if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    addPass(createAMDGPUCFGPreparationPass(*TM));
     addPass(createAMDGPUCFGStructurizerPass(*TM));
     addPass(createR600ExpandSpecialInstrsPass(*TM));
     addPass(&FinalizeMachineBundlesID);

Modified: llvm/trunk/lib/Target/R600/AMDILCFGStructurizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDILCFGStructurizer.cpp?rev=186724&r1=186723&r2=186724&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDILCFGStructurizer.cpp (original)
+++ llvm/trunk/lib/Target/R600/AMDILCFGStructurizer.cpp Fri Jul 19 16:45:06 2013
@@ -12,11 +12,13 @@
 
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
+#include "R600InstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/Analysis/DominatorInternals.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -47,12 +49,8 @@ STATISTIC(numSerialPatternMatch,    "CFG
     "matched");
 STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
     "matched");
-STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
-    "pattern matched");
 STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
     "pattern matched");
-STATISTIC(numLoopPatternMatch,      "CFGStructurizer number of loop pattern "
-    "matched");
 STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
 STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
 
@@ -81,16 +79,6 @@ DEBUG( \
 );
 
 #define INVALIDSCCNUM -1
-#define INVALIDREGNUM 0
-
-template<class LoopinfoT>
-void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
-  for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
-       iterEnd = LoopInfo.end();
-       iter != iterEnd; ++iter) {
-    (*iter)->print(OS, 0);
-  }
-}
 
 template<class NodeT>
 void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
@@ -110,40 +98,14 @@ void ReverseVector(SmallVectorImpl<NodeT
 //
 //===----------------------------------------------------------------------===//
 
+
 namespace {
-template<class PassT>
-struct CFGStructTraits {
-};
 
-template <class InstrT>
 class BlockInformation {
 public:
-  bool isRetired;
-  int  sccNum;
-  //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
-  //Instructions defining the corresponding successor.
-  BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
-};
-
-template <class BlockT, class InstrT, class RegiT>
-class LandInformation {
-public:
-  BlockT *landBlk;
-  std::set<RegiT> breakInitRegs;  //Registers that need to "reg = 0", before
-                                  //WHILELOOP(thisloop) init before entering
-                                  //thisloop.
-  std::set<RegiT> contInitRegs;   //Registers that need to "reg = 0", after
-                                  //WHILELOOP(thisloop) init after entering
-                                  //thisloop.
-  std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop
-                                     //land block, branch cond on this reg.
-  std::set<RegiT> breakOnRegs;       //registers that need to "if (reg) break
-                                     //endif" after ENDLOOP(thisloop) break
-                                     //outerLoopOf(thisLoop).
-  std::set<RegiT> contOnRegs;       //registers that need to "if (reg) continue
-                                    //endif" after ENDLOOP(thisloop) continue on
-                                    //outerLoopOf(thisLoop).
-  LandInformation() : landBlk(NULL) {}
+  bool IsRetired;
+  int  SccNum;
+  BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
 };
 
 } // end anonymous namespace
@@ -155,1030 +117,1218 @@ public:
 //===----------------------------------------------------------------------===//
 
 namespace {
-// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
-template<class PassT>
-class  CFGStructurizer {
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
 public:
-  typedef enum {
+  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
+  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
+  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;
+
+  enum PathToKind {
     Not_SinglePath = 0,
     SinglePath_InPath = 1,
     SinglePath_NotInPath = 2
-  } PathToKind;
+  };
 
-public:
-  typedef typename PassT::InstructionType         InstrT;
-  typedef typename PassT::FunctionType            FuncT;
-  typedef typename PassT::DominatortreeType       DomTreeT;
-  typedef typename PassT::PostDominatortreeType   PostDomTreeT;
-  typedef typename PassT::DomTreeNodeType         DomTreeNodeT;
-  typedef typename PassT::LoopinfoType            LoopInfoT;
-
-  typedef GraphTraits<FuncT *>                    FuncGTraits;
-  //typedef FuncGTraits::nodes_iterator BlockIterator;
-  typedef typename FuncT::iterator                BlockIterator;
-
-  typedef typename FuncGTraits::NodeType          BlockT;
-  typedef GraphTraits<BlockT *>                   BlockGTraits;
-  typedef GraphTraits<Inverse<BlockT *> >         InvBlockGTraits;
-  //typedef BlockGTraits::succ_iterator InstructionIterator;
-  typedef typename BlockT::iterator               InstrIterator;
-
-  typedef CFGStructTraits<PassT>                  CFGTraits;
-  typedef BlockInformation<InstrT>                BlockInfo;
-  typedef std::map<BlockT *, BlockInfo *>         BlockInfoMap;
-
-  typedef int                                     RegiT;
-  typedef typename PassT::LoopType                LoopT;
-  typedef LandInformation<BlockT, InstrT, RegiT>  LoopLandInfo;
-        typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap;
-        //landing info for loop break
-  typedef SmallVector<BlockT *, 32>               BlockTSmallerVector;
+  static char ID;
 
-public:
-  CFGStructurizer();
-  ~CFGStructurizer();
+  AMDGPUCFGStructurizer(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm),
+      TII(static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
+      TRI(&TII->getRegisterInfo()) { }
+
+   const char *getPassName() const {
+    return "AMD IL Control Flow Graph structurizer Pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addPreserved<MachineFunctionAnalysis>();
+    AU.addRequired<MachineFunctionAnalysis>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+  }
 
   /// Perform the CFG structurization
-  bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
+  bool run();
 
   /// Perform the CFG preparation
-  bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
+  /// This step will remove every unconditionnal/dead jump instructions and make
+  /// sure all loops have an exit block
+  bool prepare();
+
+  bool runOnMachineFunction(MachineFunction &MF) {
+    DEBUG(MF.dump(););
+    OrderedBlks.clear();
+    FuncRep = &MF;
+    MLI = &getAnalysis<MachineLoopInfo>();
+    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+    MDT = &getAnalysis<MachineDominatorTree>();
+    DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
+    PDT = &getAnalysis<MachinePostDominatorTree>();
+    DEBUG(PDT->print(dbgs()););
+    prepare();
+    run();
+    DEBUG(MF.dump(););
+    return true;
+  }
 
-private:
-  void reversePredicateSetter(typename BlockT::iterator);
-  void   orderBlocks();
-  void   printOrderedBlocks(llvm::raw_ostream &OS);
-  int patternMatch(BlockT *CurBlock);
-  int patternMatchGroup(BlockT *CurBlock);
-
-  int serialPatternMatch(BlockT *CurBlock);
-  int ifPatternMatch(BlockT *CurBlock);
-  int switchPatternMatch(BlockT *CurBlock);
-  int loopendPatternMatch(BlockT *CurBlock);
-  int loopPatternMatch(BlockT *CurBlock);
-
-  int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
-  int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
-  //int loopWithoutBreak(BlockT *);
-
-  void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
-                        BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
-  void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
-                           BlockT *ContBlock, LoopT *contLoop);
-  bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
-  int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                       BlockT *FalseBlock);
-  int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
-                          BlockT *FalseBlock);
-  int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                              BlockT *FalseBlock, BlockT **LandBlockPtr);
-  void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                                   BlockT *FalseBlock, BlockT *LandBlock,
-                                   bool Detail = false);
-  PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
-                          bool AllowSideEntry = true);
-  BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
-                        bool AllowSideEntry = true);
-  int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
-  void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
-
-  void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
-                            BlockT *TrueBlock, BlockT *FalseBlock,
-                            BlockT *LandBlock);
-  void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
-  void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
-                           BlockT *ExitLandBlock, RegiT SetReg);
-  void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
-                           RegiT SetReg);
-  BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
-                                std::set<BlockT*> &ExitBlockSet,
-                                BlockT *ExitLandBlk);
-  BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
-                                BlockTSmallerVector &ExitingBlocks,
-                                BlockTSmallerVector &ExitBlocks);
-  BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
-  void removeUnconditionalBranch(BlockT *SrcBlock);
-  void removeRedundantConditionalBranch(BlockT *SrcBlock);
-  void addDummyExitBlock(SmallVectorImpl<BlockT *> &RetBlocks);
-
-  void removeSuccessor(BlockT *SrcBlock);
-  BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
-  BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
-
-  void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
-                          InstrIterator InsertPos);
-
-  void recordSccnum(BlockT *SrcBlock, int SCCNum);
-  int getSCCNum(BlockT *srcBlk);
-
-  void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
-  bool isRetiredBlock(BlockT *SrcBlock);
-  bool isActiveLoophead(BlockT *CurBlock);
-  bool needMigrateBlock(BlockT *Block);
-
-  BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
-                              BlockTSmallerVector &exitBlocks,
-                              std::set<BlockT*> &ExitBlockSet);
-  void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
-  BlockT *getLoopLandBlock(LoopT *LoopRep);
-  LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
-
-  void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
-
-  bool hasBackEdge(BlockT *curBlock);
-  unsigned getLoopDepth  (LoopT *LoopRep);
-  int countActiveBlock(
-    typename SmallVectorImpl<BlockT *>::const_iterator IterStart,
-    typename SmallVectorImpl<BlockT *>::const_iterator IterEnd);
-    BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
-  BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
+protected:
+  TargetMachine &TM;
+  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *PDT;
+  MachineLoopInfo *MLI;
+  const R600InstrInfo *TII;
+  const AMDGPURegisterInfo *TRI;
+
+  // PRINT FUNCTIONS
+  /// Print the ordered Blocks.
+  void printOrderedBlocks() const {
+    size_t i = 0;
+    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
+        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
+      dbgs() << "BB" << (*iterBlk)->getNumber();
+      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+      if (i != 0 && i % 10 == 0) {
+        dbgs() << "\n";
+      } else {
+        dbgs() << " ";
+      }
+    }
+  }
+  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
+    for (MachineLoop::iterator iter = LoopInfo.begin(),
+         iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
+      (*iter)->print(dbgs(), 0);
+    }
+  }
+
+  // UTILITY FUNCTIONS
+  int getSCCNum(MachineBasicBlock *MBB) const;
+  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
+  bool hasBackEdge(MachineBasicBlock *MBB) const;
+  static unsigned getLoopDepth(MachineLoop *LoopRep);
+  bool isRetiredBlock(MachineBasicBlock *MBB) const;
+  bool isActiveLoophead(MachineBasicBlock *MBB) const;
+  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+      bool AllowSideEntry = true) const;
+  int countActiveBlock(MBBVector::const_iterator It,
+      MBBVector::const_iterator E) const;
+  bool needMigrateBlock(MachineBasicBlock *MBB) const;
+
+  // Utility Functions
+  void reversePredicateSetter(MachineBasicBlock::iterator I);
+  /// Compute the reversed DFS post order of Blocks
+  void orderBlocks(MachineFunction *MF);
+
+  // Function originaly from CFGStructTraits
+  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
+      DebugLoc DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
+    DebugLoc DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
+  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
+      DebugLoc DL);
+  void insertCondBranchBefore(MachineBasicBlock *MBB,
+      MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
+      DebugLoc DL);
+  void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum);
+  static int getBranchNzeroOpcode(int OldOpcode);
+  static int getBranchZeroOpcode(int OldOpcode);
+  static int getContinueNzeroOpcode(int OldOpcode);
+  static int getContinueZeroOpcode(int OldOpcode);
+  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
+  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
+  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
+      MachineInstr *MI);
+  static bool isCondBranch(MachineInstr *MI);
+  static bool isUncondBranch(MachineInstr *MI);
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
+  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  ///
+  /// BB with backward-edge could have move instructions after the branch
+  /// instruction.  Such move instruction "belong to" the loop backward-edge.
+  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *MBB);
+  static bool isReturnBlock(MachineBasicBlock *MBB);
+  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB) ;
+  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
+  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
+  /// because the AMDGPU instruction is not recognized as terminator fix this
+  /// and retire this routine
+  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
+  static void wrapup(MachineBasicBlock *MBB);
+
+
+  int patternMatch(MachineBasicBlock *MBB);
+  int patternMatchGroup(MachineBasicBlock *MBB);
+  int serialPatternMatch(MachineBasicBlock *MBB);
+  int ifPatternMatch(MachineBasicBlock *MBB);
+  int loopendPatternMatch();
+  int mergeLoop(MachineLoop *LoopRep);
+  int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);
+
+  void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
+      MachineLoop *ContLoop);
+  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
+  /// the same loop with LoopLandInfo without explicitly keeping track of
+  /// loopContBlks and loopBreakBlks, this is a method to get the information.
+  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
+      MachineBasicBlock *Src2MBB);
+  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock **LandMBBPtr);
+  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock *LandMBB, bool Detail = false);
+  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
+  void mergeSerialBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB);
+
+  void mergeIfthenelseBlock(MachineInstr *BranchMI,
+      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
+  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *LandMBB);
+  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+      MachineBasicBlock *LandMBB);
+  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineBasicBlock *ContMBB);
+  /// normalizeInfiniteLoopExit change
+  ///   B1:
+  ///        uncond_br LoopHeader
+  ///
+  /// to
+  ///   B1:
+  ///        cond_br 1 LoopHeader dummyExit
+  /// and return the newly added dummy exit block
+  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
+  void removeUnconditionalBranch(MachineBasicBlock *MBB);
+  /// Remove duplicate branches instructions in a block.
+  /// For instance
+  /// B0:
+  ///    cond_br X B1 B2
+  ///    cond_br X B1 B2
+  /// is transformed to
+  /// B0:
+  ///    cond_br X B1 B2
+  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
+  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
+  void removeSuccessor(MachineBasicBlock *MBB);
+  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
+      MachineBasicBlock *PredMBB);
+  void migrateInstruction(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
+  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
+  void retireBlock(MachineBasicBlock *MBB);
+  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL);
+
+  MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
+  /// This is work around solution for findNearestCommonDominator not avaiable
+  /// to post dom a proper fix should go to Dominators.h.
+  MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
+      MachineBasicBlock *MBB2);
 
 private:
-  DomTreeT *domTree;
-  PostDomTreeT *postDomTree;
-  LoopInfoT *loopInfo;
-  PassT *passRep;
-  FuncT *funcRep;
-
-  BlockInfoMap blockInfoMap;
-  LoopLandInfoMap loopLandInfoMap;
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
-  const AMDGPURegisterInfo *TRI;
+  MBBInfoMap BlockInfoMap;
+  LoopLandInfoMap LLInfoMap;
+  std::map<MachineLoop *, bool> Visited;
+  MachineFunction *FuncRep;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
+};
+
+int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return INVALIDSCCNUM;
+  return (*It).second->SccNum;
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
+    const {
+  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
+  if (It == LLInfoMap.end())
+    return NULL;
+  return (*It).second;
+}
 
-};  //template class CFGStructurizer
+bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  if (!LoopRep)
+    return false;
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  return MBB->isSuccessor(LoopHeader);
+}
+
+unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
+  return LoopRep ? LoopRep->getLoopDepth() : 0;
+}
 
-template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
-  : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
+bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return false;
+  return (*It).second->IsRetired;
 }
 
-template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
-  for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
-       E = blockInfoMap.end(); I != E; ++I) {
-    delete I->second;
+bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  while (LoopRep && LoopRep->getHeader() == MBB) {
+    MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
+    if(!LoopLand)
+      return true;
+    if (!isRetiredBlock(LoopLand))
+      return true;
+    LoopRep = LoopRep->getParentLoop();
+  }
+  return false;
+}
+AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+    bool AllowSideEntry) const {
+  assert(DstMBB);
+  if (SrcMBB == DstMBB)
+    return SinglePath_InPath;
+  while (SrcMBB && SrcMBB->succ_size() == 1) {
+    SrcMBB = *SrcMBB->succ_begin();
+    if (SrcMBB == DstMBB)
+      return SinglePath_InPath;
+    if (!AllowSideEntry && SrcMBB->pred_size() > 1)
+      return Not_SinglePath;
   }
+  if (SrcMBB && SrcMBB->succ_size()==0)
+    return SinglePath_NotInPath;
+  return Not_SinglePath;
 }
 
-template<class PassT>
-bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass,
-                                     const AMDGPURegisterInfo * tri) {
-  passRep = &pass;
-  funcRep = &func;
-  TRI = tri;
+int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
+    MBBVector::const_iterator E) const {
+  int Count = 0;
+  while (It != E) {
+    if (!isRetiredBlock(*It))
+      ++Count;
+    ++It;
+  }
+  return Count;
+}
+
+bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
+  unsigned BlockSizeThreshold = 30;
+  unsigned CloneInstrThreshold = 100;
+  bool MultiplePreds = MBB && (MBB->pred_size() > 1);
 
-  bool changed = false;
+  if(!MultiplePreds)
+    return false;
+  unsigned BlkSize = MBB->size();
+  return ((BlkSize > BlockSizeThreshold) &&
+      (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
+}
 
-  //FIXME: if not reducible flow graph, make it so ???
+void AMDGPUCFGStructurizer::reversePredicateSetter(
+    MachineBasicBlock::iterator I) {
+  while (I--) {
+    if (I->getOpcode() == AMDGPU::PRED_X) {
+      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
+      case OPCODE_IS_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_NOT_ZERO_INT);
+        return;
+      case OPCODE_IS_NOT_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_ZERO_INT);
+        return;
+      case OPCODE_IS_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_NOT_ZERO);
+        return;
+      case OPCODE_IS_NOT_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_ZERO);
+        return;
+      default:
+        llvm_unreachable("PRED_X Opcode invalid!");
+      }
+    }
+  }
+}
 
-  DEBUG(
-        dbgs() << "AMDGPUCFGStructurizer::prepare\n";
-  );
+void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
+    int NewOpcode, DebugLoc DL) {
+ MachineInstr *MI = MBB->getParent()
+    ->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->push_back(MI);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(MI);
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
+    int NewOpcode, DebugLoc DL) {
+  MachineInstr *MI =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
+  if (MBB->begin() != MBB->end())
+    MBB->insert(MBB->begin(), MI);
+  else
+    MBB->push_back(MI);
+  SHOWNEWINSTR(MI);
+  return MI;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
+    MachineBasicBlock::iterator I, int NewOpcode) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineInstr *NewMBB =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->insert(I, NewMBB);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(NewMBB);
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+    MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->insert(I, NewMI);
+  MachineInstrBuilder MIB(*MF, NewMI);
+  MIB.addReg(OldMI->getOperand(1).getReg(), false);
+  SHOWNEWINSTR(NewMI);
+  //erase later oldInstr->eraseFromParent();
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
+    MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
+    DebugLoc DL) {
+  MachineFunction *MF = blk->getParent();
+  MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  //insert before
+  blk->insert(I, NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB,
+    int NewOpcode, int RegNum) {
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewInstr =
+    MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->push_back(NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
+
+int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+  default: llvm_unreachable("internal error");
+  };
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
+  return MI->getOperand(0).getMBB();
+}
+
+void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
+    MachineBasicBlock *MBB) {
+  MI->getOperand(0).setMBB(MBB);
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
+    MachineInstr *MI) {
+  assert(MBB->succ_size() == 2);
+  MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+  MachineBasicBlock::succ_iterator It = MBB->succ_begin();
+  MachineBasicBlock::succ_iterator Next = It;
+  ++Next;
+  return (*It == TrueBranch) ? *Next : *It;
+}
 
-  loopInfo = CFGTraits::getLoopInfo(pass);
-  DEBUG(
-    dbgs() << "LoopInfo:\n";
-    PrintLoopinfo(*loopInfo, dbgs());
-  );
+/// Return true when \p MI is one of the conditional branch opcodes
+/// (JUMP_COND, BRANCH_COND_i32, BRANCH_COND_f32).
+bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
+  return MI->getOpcode() == AMDGPU::JUMP_COND ||
+         MI->getOpcode() == AMDGPU::BRANCH_COND_i32 ||
+         MI->getOpcode() == AMDGPU::BRANCH_COND_f32;
+}
 
-  orderBlocks();
-  DEBUG(
-    for (typename SmallVectorImpl<BlockT *>::const_iterator
-        iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
-        iterBlk != iterBlkEnd;
-        ++iterBlk) {
-      (*iterBlk)->dump();
+/// Return true when \p MI is one of the unconditional branch opcodes
+/// (JUMP, BRANCH).
+bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
+  return MI->getOpcode() == AMDGPU::JUMP ||
+         MI->getOpcode() == AMDGPU::BRANCH;
+}
+
+/// Return the debug location of the last instruction in \p MBB that carries
+/// one, or a default-constructed DebugLoc when no instruction does.
+DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
+  DebugLoc DL;
+  for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end();
+      It != E; ++It) {
+    // Keep overwriting, so the known location closest to the end of the
+    // block is the one that is returned.
+    if (!It->getDebugLoc().isUnknown())
+      DL = It->getDebugLoc();
+  }
+  return DL;
+}
+
+/// Return the last instruction of \p MBB when it is a conditional or
+/// unconditional branch, and NULL otherwise (including for an empty block).
+MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  // An empty block has no last instruction; dereferencing rbegin() there
+  // would read past the instruction list.
+  if (It == MBB->rend())
+    return NULL;
+  MachineInstr *MI = &*It;
+  if (isCondBranch(MI) || isUncondBranch(MI))
+    return MI;
+  return NULL;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
+    MachineBasicBlock *MBB) {  // Returns a branch near the end of MBB, or NULL.
+  for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
+      It != E; ++It) {  // Walk from the last instruction backwards.
+    // FIXME: Simplify; MI is the address of *It, so "if (MI)" looks redundant.
+    MachineInstr *MI = &*It;
+    if (MI) {
+      if (isCondBranch(MI) || isUncondBranch(MI))
+        return MI;
+      else if (!TII->isMov(MI->getOpcode()))  // Only what TII calls a mov is skipped.
+        break;
     }
-    dbgs() << "Ordered blocks:\n";
-    printOrderedBlocks(dbgs());
-  );
+  }
+  return NULL;  // No branch was found before the scan stopped.
+}
+
+/// Return the last instruction of \p MBB when its opcode is AMDGPU::RETURN,
+/// and NULL otherwise (including for an empty block).
+MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator Tail = MBB->rbegin();
+  if (Tail == MBB->rend())
+    return NULL;
+
+  MachineInstr *Last = &*Tail;
+  if (Last->getOpcode() != AMDGPU::RETURN)
+    return NULL;
+  return Last;
+}
+
+/// Return the last instruction of \p MBB when its opcode is AMDGPU::CONTINUE,
+/// and NULL otherwise (including for an empty block).
+MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator Tail = MBB->rbegin();
+  if (Tail == MBB->rend())
+    return NULL;
+
+  MachineInstr *Last = &*Tail;
+  if (Last->getOpcode() != AMDGPU::CONTINUE)
+    return NULL;
+  return Last;
+}
+
+/// Return the first instruction in \p MBB whose opcode is
+/// AMDGPU::PREDICATED_BREAK, or NULL when the block has none.
+MachineInstr *AMDGPUCFGStructurizer::getLoopBreakInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator Cur = MBB->begin();
+  while (Cur != MBB->end()) {
+    if (Cur->getOpcode() == AMDGPU::PREDICATED_BREAK)
+      return &*Cur;
+    ++Cur;
+  }
+  return NULL;
+}
 
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
+/// Return true when \p MBB has no successors.
+///
+/// A block ending in a RETURN instruction is asserted to have no successors.
+/// A successor-less block without a RETURN is still reported as a return
+/// block, with a debug message.
+bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
+  MachineInstr *RetMI = getReturnInstr(MBB);
+  const bool NoSuccessors = MBB->succ_size() == 0;
+
+  if (RetMI) {
+    assert(NoSuccessors);
+    return NoSuccessors;
+  }
+
+  if (NoSuccessors) {
+    DEBUG(
+      dbgs() << "BB" << MBB->getNumber()
+             <<" is return block without RETURN instr\n";);
+  }
+  return NoSuccessors;
+}
+
+/// Add every successor of \p SrcMBB as a successor of \p DstMBB.
+void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  MachineBasicBlock::succ_iterator Succ = SrcMBB->succ_begin();
+  MachineBasicBlock::succ_iterator SuccEnd = SrcMBB->succ_end();
+  while (Succ != SuccEnd) {
+    // addSuccessor also takes care of the successor's predecessor list.
+    DstMBB->addSuccessor(*Succ);
+    ++Succ;
+  }
+}
+
+/// Create a new block at the end of \p MBB's function and fill it with clones
+/// of every instruction in \p MBB.  Successor edges are not copied here.
+MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *Copy = MF->CreateMachineBasicBlock();
+  MF->push_back(Copy);  // Attach the new block to the function.
+
+  MachineBasicBlock::iterator Src = MBB->begin();
+  MachineBasicBlock::iterator SrcEnd = MBB->end();
+  while (Src != SrcEnd) {
+    Copy->push_back(MF->CloneMachineInstr(Src));
+    ++Src;
+  }
+  return Copy;
+}
+
+/// Retarget the conditional branch found by getLoopendBlockBranchInstr in
+/// \p SrcMBB from \p OldMBB to \p NewBlk.  Nothing happens when no such
+/// branch exists, it is not conditional, or it targets another block.
+void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
+    MachineBasicBlock *NewBlk) {
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
+  if (!BranchMI || !isCondBranch(BranchMI))
+    return;
+  if (getTrueBranch(BranchMI) != OldMBB)
+    return;
+  setTrueBranch(BranchMI, NewBlk);
+}
+
+/// Final clean-up of \p MBB: erase every CONTINUE instruction that is
+/// immediately followed by an ENDLOOP.  Asserts that the function has no
+/// (non-empty) jump table.
+void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
+  assert((!MBB->getParent()->getJumpTableInfo()
+          || MBB->getParent()->getJumpTableInfo()->isEmpty())
+         && "found a jump table");
+
+  // First pass: collect the CONTINUE of every adjacent CONTINUE/ENDLOOP
+  // pair.  Erasing is deferred so the walk never touches a dead iterator.
+  SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> DeadContinues;
+  MachineBasicBlock::iterator Prev = MBB->begin();
+  MachineBasicBlock::iterator End = MBB->end();
+  for (MachineBasicBlock::iterator Cur = Prev; Cur != End; ++Cur) {
+    if (Prev->getOpcode() == AMDGPU::CONTINUE
+        && Cur->getOpcode() == AMDGPU::ENDLOOP)
+      DeadContinues.push_back(Prev);
+    Prev = Cur;
+  }
+
+  // Second pass: remove what was collected.
+  for (unsigned Idx = 0; Idx < DeadContinues.size(); ++Idx)
+    DeadContinues[Idx]->eraseFromParent();
+
+  // TODO to fix up jump table so later phase won't be confused.  if
+  // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
+  // there isn't such an interface yet.  alternatively, replace all the other
+  // blocks in the jump table with the entryBlk //}
+}
+
+
+bool AMDGPUCFGStructurizer::prepare() {
+  bool Changed = false;
+
+  //FIXME: if not reducible flow graph, make it so ???
+
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
 
-  for (typename LoopInfoT::iterator iter = loopInfo->begin(),
-       iterEnd = loopInfo->end();
-       iter != iterEnd; ++iter) {
-    LoopT* loopRep = (*iter);
-    BlockTSmallerVector exitingBlks;
-    loopRep->getExitingBlocks(exitingBlks);
-    
-    if (exitingBlks.size() == 0) {
-      BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
-      if (dummyExitBlk != NULL)
-        retBlks.push_back(dummyExitBlk);
+  orderBlocks(FuncRep);
+
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
+
+  // Add an ExitBlk to loop that don't have one
+  for (MachineLoopInfo::iterator It = MLI->begin(),
+       E = MLI->end(); It != E; ++It) {
+    MachineLoop *LoopRep = (*It);
+    MBBVector ExitingMBBs;
+    LoopRep->getExitingBlocks(ExitingMBBs);
+
+    if (ExitingMBBs.size() == 0) {
+      MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
+      if (DummyExitBlk)
+        RetBlks.push_back(DummyExitBlk);
     }
   }
 
   // Remove unconditional branch instr.
   // Add dummy exit block iff there are multiple returns.
+  for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
+       It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    removeUnconditionalBranch(MBB);
+    removeRedundantConditionalBranch(MBB);
+    if (isReturnBlock(MBB)) {
+      RetBlks.push_back(MBB);
+    }
+    assert(MBB->succ_size() <= 2);
+  }
 
-  for (typename SmallVectorImpl<BlockT *>::const_iterator
-       iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
-       iterBlk != iterEndBlk;
-       ++iterBlk) {
-    BlockT *curBlk = *iterBlk;
-    removeUnconditionalBranch(curBlk);
-    removeRedundantConditionalBranch(curBlk);
-    if (CFGTraits::isReturnBlock(curBlk)) {
-      retBlks.push_back(curBlk);
-    }
-    assert(curBlk->succ_size() <= 2);
-  } //for
-
-  if (retBlks.size() >= 2) {
-    addDummyExitBlock(retBlks);
-    changed = true;
-  }
-
-  return changed;
-} //CFGStructurizer::prepare
-
-template<class PassT>
-bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass,
-    const AMDGPURegisterInfo * tri) {
-  passRep = &pass;
-  funcRep = &func;
-  TRI = tri;
-
-  //Assume reducible CFG...
-  DEBUG(
-    dbgs() << "AMDGPUCFGStructurizer::run\n";
-    func.viewCFG();
-  );
+  if (RetBlks.size() >= 2) {
+    addDummyExitBlock(RetBlks);
+    Changed = true;
+  }
 
-  domTree = CFGTraits::getDominatorTree(pass);
-  DEBUG(
-    domTree->print(dbgs(), (const llvm::Module*)0);
-  );
+  return Changed;
+}
 
-  postDomTree = CFGTraits::getPostDominatorTree(pass);
-  DEBUG(
-    postDomTree->print(dbgs());
-  );
+bool AMDGPUCFGStructurizer::run() {
 
-  loopInfo = CFGTraits::getLoopInfo(pass);
-  DEBUG(
-    dbgs() << "LoopInfo:\n";
-    PrintLoopinfo(*loopInfo, dbgs());
-  );
+  //Assume reducible CFG...
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n";FuncRep->viewCFG(););
 
-  orderBlocks();
 #ifdef STRESSTEST
   //Use the worse block ordering to test the algorithm.
   ReverseVector(orderedBlks);
 #endif
 
-  DEBUG(
-    dbgs() << "Ordered blocks:\n";
-    printOrderedBlocks(dbgs());
-  );
-  int numIter = 0;
-  bool finish = false;
-  BlockT *curBlk;
-  bool makeProgress = false;
-  int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
-                                        orderedBlks.end());
+  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+  int NumIter = 0;
+  bool Finish = false;
+  MachineBasicBlock *MBB;
+  bool MakeProgress = false;
+  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
+                                        OrderedBlks.end());
 
   do {
-    ++numIter;
+    ++NumIter;
     DEBUG(
-      dbgs() << "numIter = " << numIter
-             << ", numRemaintedBlk = " << numRemainedBlk << "\n";
+      dbgs() << "numIter = " << NumIter
+             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
     );
 
-    typename SmallVectorImpl<BlockT *>::const_iterator
-      iterBlk = orderedBlks.begin();
-    typename SmallVectorImpl<BlockT *>::const_iterator
-      iterBlkEnd = orderedBlks.end();
-
-    typename SmallVectorImpl<BlockT *>::const_iterator
-      sccBeginIter = iterBlk;
-    BlockT *sccBeginBlk = NULL;
-    int sccNumBlk = 0;  // The number of active blocks, init to a
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
+        OrderedBlks.begin();
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
+        OrderedBlks.end();
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
+        It;
+    MachineBasicBlock *SccBeginMBB = NULL;
+    int SccNumBlk = 0;  // The number of active blocks, init to a
                         // maximum possible number.
-    int sccNumIter;     // Number of iteration in this SCC.
+    int SccNumIter;     // Number of iteration in this SCC.
 
-    while (iterBlk != iterBlkEnd) {
-      curBlk = *iterBlk;
+    while (It != E) {
+      MBB = *It;
 
-      if (sccBeginBlk == NULL) {
-        sccBeginIter = iterBlk;
-        sccBeginBlk = curBlk;
-        sccNumIter = 0;
-        sccNumBlk = numRemainedBlk; // Init to maximum possible number.
+      if (!SccBeginMBB) {
+        SccBeginIter = It;
+        SccBeginMBB = MBB;
+        SccNumIter = 0;
+        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
         DEBUG(
-              dbgs() << "start processing SCC" << getSCCNum(sccBeginBlk);
+              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
               dbgs() << "\n";
         );
       }
 
-      if (!isRetiredBlock(curBlk)) {
-        patternMatch(curBlk);
-      }
+      if (!isRetiredBlock(MBB))
+        patternMatch(MBB);
 
-      ++iterBlk;
+      ++It;
 
-      bool contNextScc = true;
-      if (iterBlk == iterBlkEnd
-          || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
+      bool ContNextScc = true;
+      if (It == E
+          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
         // Just finish one scc.
-        ++sccNumIter;
-        int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
-        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
+        ++SccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
           DEBUG(
-            dbgs() << "Can't reduce SCC " << getSCCNum(curBlk)
-                   << ", sccNumIter = " << sccNumIter;
+            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+                   << ", sccNumIter = " << SccNumIter;
             dbgs() << "doesn't make any progress\n";
           );
-          contNextScc = true;
-        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
-          sccNumBlk = sccRemainedNumBlk;
-          iterBlk = sccBeginIter;
-          contNextScc = false;
+          ContNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
+          SccNumBlk = sccRemainedNumBlk;
+          It = SccBeginIter;
+          ContNextScc = false;
           DEBUG(
-            dbgs() << "repeat processing SCC" << getSCCNum(curBlk)
-                   << "sccNumIter = " << sccNumIter << "\n";
-            func.viewCFG();
+            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+                   << "sccNumIter = " << SccNumIter << "\n";
+            FuncRep->viewCFG();
           );
         } else {
           // Finish the current scc.
-          contNextScc = true;
+          ContNextScc = true;
         }
       } else {
         // Continue on next component in the current scc.
-        contNextScc = false;
+        ContNextScc = false;
       }
 
-      if (contNextScc) {
-        sccBeginBlk = NULL;
-      }
+      if (ContNextScc)
+        SccBeginMBB = NULL;
     } //while, "one iteration" over the function.
 
-    BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
-    if (entryBlk->succ_size() == 0) {
-      finish = true;
+    MachineBasicBlock *EntryMBB =
+        GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+    if (EntryMBB->succ_size() == 0) {
+      Finish = true;
       DEBUG(
         dbgs() << "Reduce to one block\n";
       );
     } else {
-      int newnumRemainedBlk
-        = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
+      int NewnumRemainedBlk
+        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
       // consider cloned blocks ??
-      if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
-        makeProgress = true;
-        numRemainedBlk = newnumRemainedBlk;
+      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
+        MakeProgress = true;
+        NumRemainedBlk = NewnumRemainedBlk;
       } else {
-        makeProgress = false;
+        MakeProgress = false;
         DEBUG(
           dbgs() << "No progress\n";
         );
       }
     }
-  } while (!finish && makeProgress);
+  } while (!Finish && MakeProgress);
 
   // Misc wrap up to maintain the consistency of the Function representation.
-  CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
+  wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
 
   // Detach retired Block, release memory.
-  for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
-       iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
-    if ((*iterMap).second && (*iterMap).second->isRetired) {
-      assert(((*iterMap).first)->getNumber() != -1);
+  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
+      It != E; ++It) {
+    if ((*It).second && (*It).second->IsRetired) {
+      assert(((*It).first)->getNumber() != -1);
       DEBUG(
-        dbgs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
+        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
       );
-      (*iterMap).first->eraseFromParent();  //Remove from the parent Function.
+      (*It).first->eraseFromParent();  //Remove from the parent Function.
     }
-    delete (*iterMap).second;
-  }
-  blockInfoMap.clear();
-
-  // clear loopLandInfoMap
-  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
-       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
-    delete (*iterMap).second;
+    delete (*It).second;
   }
-  loopLandInfoMap.clear();
+  BlockInfoMap.clear();
+  LLInfoMap.clear();
 
   DEBUG(
-    func.viewCFG();
+    FuncRep->viewCFG();
   );
 
-  if (!finish) {
+  if (!Finish)
     llvm_unreachable("IRREDUCIBL_CF");
-  }
 
   return true;
-} //CFGStructurizer::run
+}
 
-/// Print the ordered Blocks.
-///
-template<class PassT>
-void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
-  size_t i = 0;
-  for (typename SmallVectorImpl<BlockT *>::const_iterator
-      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
-       iterBlk != iterBlkEnd;
-       ++iterBlk, ++i) {
-    os << "BB" << (*iterBlk)->getNumber();
-    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
-    if (i != 0 && i % 10 == 0) {
-      os << "\n";
-    } else {
-      os << " ";
-    }
-  }
-} //printOrderedBlocks
 
-/// Compute the reversed DFS post order of Blocks
-///
-template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
-  int sccNum = 0;
-  BlockT *bb;
-  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
-       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
-    std::vector<BlockT *> &sccNext = *sccIter;
-    for (typename std::vector<BlockT *>::const_iterator
-         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+
+void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
+  int SccNum = 0;
+  MachineBasicBlock *MBB;
+  for (scc_iterator<MachineFunction *> It = scc_begin(MF), E = scc_end(MF);
+      It != E; ++It, ++SccNum) {
+    std::vector<MachineBasicBlock *> &SccNext = *It;
+    for (std::vector<MachineBasicBlock *>::const_iterator
+         blockIter = SccNext.begin(), blockEnd = SccNext.end();
          blockIter != blockEnd; ++blockIter) {
-      bb = *blockIter;
-      orderedBlks.push_back(bb);
-      recordSccnum(bb, sccNum);
+      MBB = *blockIter;
+      OrderedBlks.push_back(MBB);
+      recordSccnum(MBB, SccNum);
     }
   }
 
   //walk through all the block in func to check for unreachable
-  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
-       blockEnd1 = FuncGTraits::nodes_end(funcRep);
-       blockIter1 != blockEnd1; ++blockIter1) {
-    BlockT *bb = &(*blockIter1);
-    sccNum = getSCCNum(bb);
-    if (sccNum == INVALIDSCCNUM) {
-      dbgs() << "unreachable block BB" << bb->getNumber() << "\n";
-    }
+  typedef GraphTraits<MachineFunction *> GTM;
+  MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = &(*It);
+    SccNum = getSCCNum(MBB);
+    if (SccNum == INVALIDSCCNUM)
+      dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
   }
-} //orderBlocks
-
-template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
-  int numMatch = 0;
-  int curMatch;
+}
+
+int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {  // Returns the total match count for MBB.
+  int NumMatch = 0;  // Running sum of every positive patternMatchGroup result.
+  int CurMatch;
 
   DEBUG(
-        dbgs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
+        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
   );
 
-  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
-    numMatch += curMatch;
-  }
+  while ((CurMatch = patternMatchGroup(MBB)) > 0)  // Repeat until a pass over MBB matches nothing.
+    NumMatch += CurMatch;
 
   DEBUG(
-        dbgs() << "End patternMatch BB" << curBlk->getNumber()
-      << ", numMatch = " << numMatch << "\n";
+        dbgs() << "End patternMatch BB" << MBB->getNumber()
+      << ", numMatch = " << NumMatch << "\n";
   );
 
-  return numMatch;
-} //patternMatch
+  return NumMatch;
+}
+
+/// Run one round of pattern matching and return the summed match count.
+///
+/// The matchers run in a fixed order: loop-end first (it takes no block
+/// argument), then the serial and if patterns on \p MBB.
+int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
+  // Separate statements keep the call order fixed.
+  int Total = loopendPatternMatch();
+  Total += serialPatternMatch(MBB);
+  Total += ifPatternMatch(MBB);
+  return Total;
+}
+
 
-template<class PassT>
-int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
-  int numMatch = 0;
-  numMatch += serialPatternMatch(curBlk);
-  numMatch += ifPatternMatch(curBlk);
-  numMatch += loopendPatternMatch(curBlk);
-  numMatch += loopPatternMatch(curBlk);
-  return numMatch;
-}//patternMatchGroup
-
-template<class PassT>
-int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
-  if (curBlk->succ_size() != 1) {
+int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {  // Returns 1 if MBB was merged with its only successor, else 0.
+  if (MBB->succ_size() != 1)  // Needs exactly one successor.
     return 0;
-  }
 
-  BlockT *childBlk = *curBlk->succ_begin();
-  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+  MachineBasicBlock *childBlk = *MBB->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))  // Successor must have one predecessor and not be an active loop head.
     return 0;
-  }
 
-  mergeSerialBlock(curBlk, childBlk);
+  mergeSerialBlock(MBB, childBlk);
   ++numSerialPatternMatch;
   return 1;
-} //serialPatternMatch
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
   //two edges
-  if (curBlk->succ_size() != 2) {
+  if (MBB->succ_size() != 2)
     return 0;
-  }
-
-  if (hasBackEdge(curBlk)) {
+  if (hasBackEdge(MBB))
     return 0;
-  }
-
-  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
-  if (branchInstr == NULL) {
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  if (!BranchMI)
     return 0;
-  }
 
-  assert(CFGTraits::isCondBranch(branchInstr));
+  assert(isCondBranch(BranchMI));
 
-  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
-  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
-  BlockT *landBlk;
-  int cloned = 0;
+  MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
+  serialPatternMatch(TrueMBB);
+  ifPatternMatch(TrueMBB);
+  MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
+  serialPatternMatch(FalseMBB);
+  ifPatternMatch(FalseMBB);
+  MachineBasicBlock *LandBlk;
+  int Cloned = 0;
 
+  assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
   // TODO: Simplify
-  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
-    && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
-    landBlk = *trueBlk->succ_begin();
-  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
-    landBlk = NULL;
-  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
-    landBlk = falseBlk;
-    falseBlk = NULL;
-  } else if (falseBlk->succ_size() == 1
-             && *falseBlk->succ_begin() == trueBlk) {
-    landBlk = trueBlk;
-    trueBlk = NULL;
-  } else if (falseBlk->succ_size() == 1
-             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
-    landBlk = *falseBlk->succ_begin();
-  } else if (trueBlk->succ_size() == 1
-    && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
-    landBlk = *trueBlk->succ_begin();
+  if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
+    && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
+    // Diamond pattern
+    LandBlk = *TrueMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
+    // Triangle pattern, false is empty
+    LandBlk = FalseMBB;
+    FalseMBB = NULL;
+  } else if (FalseMBB->succ_size() == 1
+             && *FalseMBB->succ_begin() == TrueMBB) {
+    // Triangle pattern, true is empty
+    LandBlk = TrueMBB;
+    TrueMBB = NULL;
+  } else if (FalseMBB->succ_size() == 1
+             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
+    LandBlk = *FalseMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1
+    && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
+    LandBlk = *TrueMBB->succ_begin();
   } else {
-    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+    return handleJumpintoIf(MBB, TrueMBB, FalseMBB);
   }
 
   // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
   // new BB created for landBlk==NULL may introduce new challenge to the
   // reduction process.
-  if (landBlk != NULL &&
-      ((trueBlk && trueBlk->pred_size() > 1)
-      || (falseBlk && falseBlk->pred_size() > 1))) {
-     cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  if (LandBlk &&
+      ((TrueMBB && TrueMBB->pred_size() > 1)
+      || (FalseMBB && FalseMBB->pred_size() > 1))) {
+     Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
   }
 
-  if (trueBlk && trueBlk->pred_size() > 1) {
-    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
-    ++cloned;
+  if (TrueMBB && TrueMBB->pred_size() > 1) {
+    TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
+    ++Cloned;
   }
 
-  if (falseBlk && falseBlk->pred_size() > 1) {
-    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
-    ++cloned;
+  if (FalseMBB && FalseMBB->pred_size() > 1) {
+    FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
+    ++Cloned;
   }
 
-  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+  mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
 
   ++numIfPatternMatch;
 
-  numClonedBlock += cloned;
+  numClonedBlock += Cloned;
 
-  return 1 + cloned;
-} //ifPatternMatch
+  return 1 + Cloned;
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
-  return 0;
-} //switchPatternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  typename std::vector<LoopT *> nestedLoops;
-  while (loopRep) {
-    nestedLoops.push_back(loopRep);
-    loopRep = loopRep->getParentLoop();
+int AMDGPUCFGStructurizer::loopendPatternMatch() {
+  std::vector<MachineLoop *> NestedLoops;
+  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end();
+      It != E; ++It) {
+    df_iterator<MachineLoop *> LpIt = df_begin(*It),
+        LpE = df_end(*It);
+    for (; LpIt != LpE; ++LpIt)
+      NestedLoops.push_back(*LpIt);
   }
-
-  if (nestedLoops.size() == 0) {
+  if (NestedLoops.size() == 0)
     return 0;
-  }
 
   // Process nested loop outside->inside, so "continue" to a outside loop won't
   // be mistaken as "break" of the current loop.
-  int num = 0;
-  for (typename std::vector<LoopT *>::reverse_iterator
-       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
-       iter != iterEnd; ++iter) {
-    loopRep = *iter;
-
-    if (getLoopLandBlock(loopRep) != NULL) {
+  int Num = 0;
+  for (std::vector<MachineLoop *>::reverse_iterator It = NestedLoops.rbegin(),
+      E = NestedLoops.rend(); It != E; ++It) {
+    MachineLoop *ExaminedLoop = *It;
+    if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
       continue;
-    }
-
-    BlockT *loopHeader = loopRep->getHeader();
-
-    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
-
-    if (numBreak == -1) {
+    DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+    int NumBreak = mergeLoop(ExaminedLoop);
+    if (NumBreak == -1)
       break;
-    }
-
-    int numCont = loopcontPatternMatch(loopRep, loopHeader);
-    num += numBreak + numCont;
+    Num += NumBreak;
   }
+  return Num;
+}
 
-  return num;
-} //loopendPatternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
-  if (curBlk->succ_size() != 0) {
-    return 0;
-  }
+/// Collapse \p LoopRep around its header block.
+///
+/// In order: every exiting block is passed to mergeLoopbreakBlock with the
+/// loop's exit block, every in-loop predecessor of the header is passed to
+/// settleLoopcontBlock, the header is reduced with the serial/if patterns
+/// until neither matches, and mergeLooplandBlock joins header and exit
+/// block.  The header is then moved to the parent loop in MLI (or removed
+/// from MLI when there is none) and the loop is marked in Visited.
+///
+/// The loop must have at least one exiting block and exactly one distinct
+/// exit block; both requirements are only enforced by assertions.
+///
+/// \returns 1 always.
+int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MBBVector ExitingMBBs;
+  LoopRep->getExitingBlocks(ExitingMBBs);
+  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
+  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+  // We assume a single ExitBlk.  getExitBlocks may list the same block more
+  // than once, so uniqueness is checked through a set.
+  MBBVector ExitBlks;
+  LoopRep->getExitBlocks(ExitBlks);
+  SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
+  for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
+    ExitBlkSet.insert(ExitBlks[i]);
+  assert(ExitBlkSet.size() == 1 && "Loop must have exactly one exit block");
+  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
+  assert(ExitBlk && "Loop exit block is null");
+
+  // Latch blocks are the predecessors of the header that lie inside the loop.
+  MBBVector LatchBlks;
+  typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
+  InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
+      PE = InvMBBTraits::child_end(LoopHeader);
+  for (; PI != PE; ++PI) {
+    if (LoopRep->contains(*PI))
+      LatchBlks.push_back(*PI);
+  }
+
+  // Both lists were collected before any rewriting starts.
+  for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
+    mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
+  for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
+    settleLoopcontBlock(LatchBlks[i], LoopHeader);
+
+  // Reduce the header until a full round matches nothing.
+  int Match = 0;
+  do {
+    Match = 0;
+    Match += serialPatternMatch(LoopHeader);
+    Match += ifPatternMatch(LoopHeader);
+  } while (Match > 0);
+  mergeLooplandBlock(LoopHeader, ExitBlk);
+
+  MachineLoop *ParentLoop = LoopRep->getParentLoop();
+  if (ParentLoop)
+    MLI->changeLoopFor(LoopHeader, ParentLoop);
+  else
+    MLI->removeBlock(LoopHeader);
+  Visited[LoopRep] = true;
+  return 1;
+}
 
-  int numLoop = 0;
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  while (loopRep && loopRep->getHeader() == curBlk) {
-    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
-    if (loopLand) {
-      BlockT *landBlk = loopLand->landBlk;
-      assert(landBlk);
-      if (!isRetiredBlock(landBlk)) {
-        mergeLooplandBlock(curBlk, loopLand);
-        ++numLoop;
-      }
+int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
+    MachineBasicBlock *LoopHeader) {
+  int NumCont = 0;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB;
+  typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM;
+  GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader),
+      E = GTIM::child_end(LoopHeader);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    if (LoopRep->contains(MBB)) {
+      handleLoopcontBlock(MBB, MLI->getLoopFor(MBB),
+                          LoopHeader, LoopRep);
+      ContMBB.push_back(MBB);
+      ++NumCont;
     }
-    loopRep = loopRep->getParentLoop();
-  }
-
-  numLoopPatternMatch += numLoop;
-
-  return numLoop;
-} //loopPatternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
-                                                  BlockT *loopHeader) {
-  BlockTSmallerVector exitingBlks;
-  loopRep->getExitingBlocks(exitingBlks);
-
-  DEBUG(
-    dbgs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
-  );
-
-  if (exitingBlks.size() == 0) {
-    setLoopLandBlock(loopRep);
-    return 0;
   }
 
-  // Compute the corresponding exitBlks and exit block set.
-  BlockTSmallerVector exitBlks;
-  std::set<BlockT *> exitBlkSet;
-  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
-       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
-    BlockT *exitingBlk = *iter;
-    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
-    exitBlks.push_back(exitBlk);
-    exitBlkSet.insert(exitBlk);  //non-duplicate insert
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
+      E = ContMBB.end(); It != E; ++It) {
+    (*It)->removeSuccessor(LoopHeader);
   }
 
-  assert(exitBlkSet.size() > 0);
-  assert(exitBlks.size() == exitingBlks.size());
+  numLoopcontPatternMatch += NumCont;
 
-  DEBUG(
-    dbgs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
-  );
-
-  // Find exitLandBlk.
-  BlockT *exitLandBlk = NULL;
-  int numCloned = 0;
-  int numSerial = 0;
-
-  if (exitBlkSet.size() == 1) {
-    exitLandBlk = *exitBlkSet.begin();
-  } else {
-    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
-
-    if (exitLandBlk == NULL) {
-      return -1;
-    }
-
-    bool allInPath = true;
-    bool allNotInPath = true;
-    for (typename std::set<BlockT*>::const_iterator
-         iter = exitBlkSet.begin(),
-         iterEnd = exitBlkSet.end();
-         iter != iterEnd; ++iter) {
-      BlockT *exitBlk = *iter;
-
-      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
-      DEBUG(
-        dbgs() << "BB" << exitBlk->getNumber()
-               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
-               << pathKind << "\n";
-      );
-
-      allInPath = allInPath && (pathKind == SinglePath_InPath);
-      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
-
-      if (!allInPath && !allNotInPath) {
-        DEBUG(
-              dbgs() << "singlePath check fail\n";
-        );
-        return -1;
-      }
-    } // check all exit blocks
-
-    if (allNotInPath) {
-
-      // TODO: Simplify, maybe separate function?
-      LoopT *parentLoopRep = loopRep->getParentLoop();
-      BlockT *parentLoopHeader = NULL;
-      if (parentLoopRep)
-        parentLoopHeader = parentLoopRep->getHeader();
-
-      if (exitLandBlk == parentLoopHeader &&
-          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
-                                               loopRep,
-                                               exitBlkSet,
-                                               exitLandBlk)) != NULL) {
-        DEBUG(
-          dbgs() << "relocateLoopcontBlock success\n";
-        );
-      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
-                                                      exitingBlks,
-                                                      exitBlks)) != NULL) {
-        DEBUG(
-          dbgs() << "insertEndbranchBlock success\n";
-        );
-      } else {
-        DEBUG(
-          dbgs() << "loop exit fail\n";
-        );
-        return -1;
-      }
-    }
-
-    // Handle side entry to exit path.
-    exitBlks.clear();
-    exitBlkSet.clear();
-    for (typename BlockTSmallerVector::iterator iterExiting =
-           exitingBlks.begin(),
-         iterExitingEnd = exitingBlks.end();
-         iterExiting != iterExitingEnd; ++iterExiting) {
-      BlockT *exitingBlk = *iterExiting;
-      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
-      BlockT *newExitBlk = exitBlk;
-
-      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
-        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
-        ++numCloned;
-      }
-
-      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
-
-      exitBlks.push_back(newExitBlk);
-      exitBlkSet.insert(newExitBlk);
-    }
-
-    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
-         iterExitEnd = exitBlks.end();
-         iterExit != iterExitEnd; ++iterExit) {
-      BlockT *exitBlk = *iterExit;
-      numSerial += serialPatternMatch(exitBlk);
-    }
-
-    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
-         iterExitEnd = exitBlks.end();
-         iterExit != iterExitEnd; ++iterExit) {
-      BlockT *exitBlk = *iterExit;
-      if (exitBlk->pred_size() > 1) {
-        if (exitBlk != exitLandBlk) {
-          return -1;
-        }
-      } else {
-        if (exitBlk != exitLandBlk &&
-            (exitBlk->succ_size() != 1 ||
-            *exitBlk->succ_begin() != exitLandBlk)) {
-          return -1;
-        }
-      }
-    }
-  } // else
-
-  exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
+  return NumCont;
+}
 
-  // Fold break into the breaking block. Leverage across level breaks.
-  assert(exitingBlks.size() == exitBlks.size());
-  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
-       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
-       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
-    BlockT *exitBlk = *iterExit;
-    BlockT *exitingBlk = *iterExiting;
-    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
-    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
-    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
-  }
-
-  int numBreak = static_cast<int>(exitingBlks.size());
-  numLoopbreakPatternMatch += numBreak;
-  numClonedBlock += numCloned;
-  return numBreak + numSerial + numCloned;
-} //loopbreakPatternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
-                                                 BlockT *loopHeader) {
-  int numCont = 0;
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
-  for (typename InvBlockGTraits::ChildIteratorType iter =
-       InvBlockGTraits::child_begin(loopHeader),
-       iterEnd = InvBlockGTraits::child_end(loopHeader);
-       iter != iterEnd; ++iter) {
-    BlockT *curBlk = *iter;
-    if (loopRep->contains(curBlk)) {
-      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
-                          loopHeader, loopRep);
-      contBlk.push_back(curBlk);
-      ++numCont;
-    }
-  }
 
-  for (typename SmallVectorImpl<BlockT *>::iterator
-       iter = contBlk.begin(), iterEnd = contBlk.end();
-       iter != iterEnd; ++iter) {
-    (*iter)->removeSuccessor(loopHeader);
-  }
-
-  numLoopcontPatternMatch += numCont;
-
-  return numCont;
-} //loopcontPatternMatch
-
-
-template<class PassT>
-bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
-                                                         BlockT *src2Blk) {
-  // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the
-  // same loop with LoopLandInfo without explicitly keeping track of
-  // loopContBlks and loopBreakBlks, this is a method to get the information.
-  //
-  if (src1Blk->succ_size() == 0) {
-    LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
-    if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
-      LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-      if (theEntry != NULL) {
+bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
+    MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
+  if (Src1MBB->succ_size() == 0) {
+    MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
+    if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
+      MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
+      if (TheEntry) {
         DEBUG(
           dbgs() << "isLoopContBreakBlock yes src1 = BB"
-                 << src1Blk->getNumber()
-                 << " src2 = BB" << src2Blk->getNumber() << "\n";
+                 << Src1MBB->getNumber()
+                 << " src2 = BB" << Src2MBB->getNumber() << "\n";
         );
         return true;
       }
     }
   }
   return false;
-}  //isSameloopDetachedContbreak
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
-                                             BlockT *trueBlk,
-                                             BlockT *falseBlk) {
-  int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
-  if (num == 0) {
+int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
+  if (Num == 0) {
     DEBUG(
       dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
     );
-    num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
+    Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
   }
-  return num;
+  return Num;
 }
 
-template<class PassT>
-int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
-                                                BlockT *trueBlk,
-                                                BlockT *falseBlk) {
-  int num = 0;
-  BlockT *downBlk;
+int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = 0;
+  MachineBasicBlock *DownBlk;
 
   //trueBlk could be the common post dominator
-  downBlk = trueBlk;
+  DownBlk = TrueMBB;
 
   DEBUG(
-    dbgs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
-           << " true = BB" << trueBlk->getNumber()
-           << ", numSucc=" << trueBlk->succ_size()
-           << " false = BB" << falseBlk->getNumber() << "\n";
+    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+           << " true = BB" << TrueMBB->getNumber()
+           << ", numSucc=" << TrueMBB->succ_size()
+           << " false = BB" << FalseMBB->getNumber() << "\n";
   );
 
-  while (downBlk) {
+  while (DownBlk) {
     DEBUG(
-      dbgs() << "check down = BB" << downBlk->getNumber();
+      dbgs() << "check down = BB" << DownBlk->getNumber();
     );
 
-    if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
+    if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
       DEBUG(
         dbgs() << " working\n";
       );
 
-      num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
-      num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
+      Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
+      Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
 
-      numClonedBlock += num;
-      num += serialPatternMatch(*headBlk->succ_begin());
-      num += serialPatternMatch(*(++headBlk->succ_begin()));
-      num += ifPatternMatch(headBlk);
-      assert(num > 0);
+      numClonedBlock += Num;
+      Num += serialPatternMatch(*HeadMBB->succ_begin());
+      Num += serialPatternMatch(*(++HeadMBB->succ_begin()));
+      Num += ifPatternMatch(HeadMBB);
+      assert(Num > 0);
 
       break;
     }
     DEBUG(
       dbgs() << " not working\n";
     );
-    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
+    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL;
   } // walk down the postDomTree
 
-  return num;
-} //handleJumpintoIf
+  return Num;
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
-                                                         BlockT *trueBlk,
-                                                         BlockT *falseBlk,
-                                                         BlockT *landBlk,
-                                                         bool detail) {
-  dbgs() << "head = BB" << headBlk->getNumber()
-         << " size = " << headBlk->size();
-  if (detail) {
+void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
+    MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
+  dbgs() << "head = BB" << HeadMBB->getNumber()
+         << " size = " << HeadMBB->size();
+  if (Detail) {
     dbgs() << "\n";
-    headBlk->print(dbgs());
+    HeadMBB->print(dbgs());
     dbgs() << "\n";
   }
 
-  if (trueBlk) {
-    dbgs() << ", true = BB" << trueBlk->getNumber() << " size = "
-           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
-    if (detail) {
+  if (TrueMBB) {
+    dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
+           << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
+    if (Detail) {
       dbgs() << "\n";
-      trueBlk->print(dbgs());
+      TrueMBB->print(dbgs());
       dbgs() << "\n";
     }
   }
-  if (falseBlk) {
-    dbgs() << ", false = BB" << falseBlk->getNumber() << " size = "
-           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
-    if (detail) {
+  if (FalseMBB) {
+    dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
+           << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
+    if (Detail) {
       dbgs() << "\n";
-      falseBlk->print(dbgs());
+      FalseMBB->print(dbgs());
       dbgs() << "\n";
     }
   }
-  if (landBlk) {
-    dbgs() << ", land = BB" << landBlk->getNumber() << " size = "
-           << landBlk->size() << " numPred = " << landBlk->pred_size();
-    if (detail) {
+  if (LandMBB) {
+    dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
+           << LandMBB->size() << " numPred = " << LandMBB->pred_size();
+    if (Detail) {
       dbgs() << "\n";
-      landBlk->print(dbgs());
+      LandMBB->print(dbgs());
       dbgs() << "\n";
     }
   }
 
     dbgs() << "\n";
-} //showImproveSimpleJumpintoIf
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
-                                                    BlockT *trueBlk,
-                                                    BlockT *falseBlk,
-                                                    BlockT **plandBlk) {
-  bool migrateTrue = false;
-  bool migrateFalse = false;
+int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+    MachineBasicBlock **LandMBBPtr) {
+  bool MigrateTrue = false;
+  bool MigrateFalse = false;
 
-  BlockT *landBlk = *plandBlk;
+  MachineBasicBlock *LandBlk = *LandMBBPtr;
 
-  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
-         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
+  assert((!TrueMBB || TrueMBB->succ_size() <= 1)
+         && (!FalseMBB || FalseMBB->succ_size() <= 1));
 
-  if (trueBlk == falseBlk) {
+  if (TrueMBB == FalseMBB)
     return 0;
-  }
 
-  migrateTrue = needMigrateBlock(trueBlk);
-  migrateFalse = needMigrateBlock(falseBlk);
+  MigrateTrue = needMigrateBlock(TrueMBB);
+  MigrateFalse = needMigrateBlock(FalseMBB);
 
-  if (!migrateTrue && !migrateFalse) {
+  if (!MigrateTrue && !MigrateFalse)
     return 0;
-  }
 
   // If we need to migrate either trueBlk or falseBlk, also migrate the other
   // one when it has more than one predecessor.  Without doing this, its
   // predecessor rather than headBlk will have an undefined value in initReg.
-  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
-    migrateTrue = true;
-  }
-  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
-    migrateFalse = true;
-  }
+  if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
+    MigrateTrue = true;
+  if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
+    MigrateFalse = true;
 
   DEBUG(
     dbgs() << "before improveSimpleJumpintoIf: ";
-    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
   );
 
   // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
@@ -1193,205 +1343,142 @@ int CFGStructurizer<PassT>::improveSimpl
   // add initReg = initVal to headBlk
 
   const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  unsigned initReg =
-    funcRep->getRegInfo().createVirtualRegister(I32RC);
-  if (!migrateTrue || !migrateFalse) {
-    int initVal = migrateTrue ? 0 : 1;
-    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
-  }
-
-  int numNewBlk = 0;
+  unsigned InitReg =
+    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+  if (!MigrateTrue || !MigrateFalse)
+    llvm_unreachable("Extra register needed to handle CFG");
+
+  int NumNewBlk = 0;
+
+  if (!LandBlk) {
+    LandBlk = HeadMBB->getParent()->CreateMachineBasicBlock();
+    HeadMBB->getParent()->push_back(LandBlk);  //insert to function
 
-  if (landBlk == NULL) {
-    landBlk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(landBlk);  //insert to function
-
-    if (trueBlk) {
-      trueBlk->addSuccessor(landBlk);
+    if (TrueMBB) {
+      TrueMBB->addSuccessor(LandBlk);
     } else {
-      headBlk->addSuccessor(landBlk);
+      HeadMBB->addSuccessor(LandBlk);
     }
 
-    if (falseBlk) {
-      falseBlk->addSuccessor(landBlk);
+    if (FalseMBB) {
+      FalseMBB->addSuccessor(LandBlk);
     } else {
-      headBlk->addSuccessor(landBlk);
+      HeadMBB->addSuccessor(LandBlk);
     }
 
-    numNewBlk ++;
+    NumNewBlk ++;
   }
 
-  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+  bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
 
   //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
-  typename BlockT::iterator insertPos =
-    CFGTraits::getInstrPos
-    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
-
-  if (landBlkHasOtherPred) {
-    unsigned immReg =
-      funcRep->getRegInfo().createVirtualRegister(I32RC);
-    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
-    unsigned cmpResReg =
-      funcRep->getRegInfo().createVirtualRegister(I32RC);
-
-    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
-                                        initReg, immReg);
-    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
-                                      AMDGPU::IF_PREDICATE_SET, passRep,
-                                      cmpResReg, DebugLoc());
+  MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+
+  if (LandBlkHasOtherPred) {
+    llvm_unreachable("Extra register needed to handle CFG");
+    unsigned CmpResReg =
+      HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+    llvm_unreachable("Extra compare instruction needed to handle CFG");
+    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+        CmpResReg, DebugLoc());
   }
 
-  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET,
-                                    passRep, initReg, DebugLoc());
+  insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+      DebugLoc());
 
-  if (migrateTrue) {
-    migrateInstruction(trueBlk, landBlk, insertPos);
+  if (MigrateTrue) {
+    migrateInstruction(TrueMBB, LandBlk, I);
     // need to unconditionally insert the assignment to ensure a path from its
     // predecessor rather than headBlk has valid value in initReg if
     // (initVal != 1).
-    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+    llvm_unreachable("Extra register needed to handle CFG");
   }
-  CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
+  insertInstrBefore(I, AMDGPU::ELSE);
 
-  if (migrateFalse) {
-    migrateInstruction(falseBlk, landBlk, insertPos);
+  if (MigrateFalse) {
+    migrateInstruction(FalseMBB, LandBlk, I);
     // need to unconditionally insert the assignment to ensure a path from its
     // predecessor rather than headBlk has valid value in initReg if
     // (initVal != 0)
-    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+    llvm_unreachable("Extra register needed to handle CFG");
   }
 
-  if (landBlkHasOtherPred) {
+  if (LandBlkHasOtherPred) {
     // add endif
-    CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
+    insertInstrBefore(I, AMDGPU::ENDIF);
 
     // put initReg = 2 to other predecessors of landBlk
-    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
-         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
-         ++predIter) {
-      BlockT *curBlk = *predIter;
-      if (curBlk != trueBlk && curBlk != falseBlk) {
-        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
-      }
-    } //for
+    for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
+         PE = LandBlk->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *MBB = *PI;
+      if (MBB != TrueMBB && MBB != FalseMBB)
+        llvm_unreachable("Extra register needed to handle CFG");
+    }
   }
   DEBUG(
     dbgs() << "result from improveSimpleJumpintoIf: ";
-    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
   );
 
   // update landBlk
-  *plandBlk = landBlk;
+  *LandMBBPtr = LandBlk;
 
-  return numNewBlk;
-} //improveSimpleJumpintoIf
+  return NumNewBlk;
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
-                                              LoopT *exitingLoop,
-                                             BlockT *exitBlk,
-                                              LoopT *exitLoop,
-                                             BlockT *landBlk) {
-  DEBUG(
-    dbgs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
-           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
-  );
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
+    MachineLoop *ContLoop) {
+  DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber()
+               << " header = BB" << ContMBB->getNumber() << "\n";
+        dbgs() << "Trying to continue loop-depth = "
+               << getLoopDepth(ContLoop)
+               << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";);
+  settleLoopcontBlock(ContingMBB, ContMBB);
+}
 
-  RegiT initReg = INVALIDREGNUM;
-  if (exitingLoop != exitLoop) {
-    initReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    assert(initReg != INVALIDREGNUM);
-    addLoopBreakInitReg(exitLoop, initReg);
-    while (exitingLoop != exitLoop && exitingLoop) {
-      addLoopBreakOnReg(exitingLoop, initReg);
-      exitingLoop = exitingLoop->getParentLoop();
-    }
-    assert(exitingLoop == exitLoop);
-  }
-
-  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
-
-} //handleLoopbreak
-
-template<class PassT>
-void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
-                                                  LoopT *contingLoop,
-                                                 BlockT *contBlk,
-                                                  LoopT *contLoop) {
+void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
   DEBUG(
-    dbgs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
-           << " header = BB" << contBlk->getNumber() << "\n";
-
-    dbgs() << "Trying to continue loop-depth = "
-           << getLoopDepth(contLoop)
-           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
+    dbgs() << "serialPattern BB" << DstMBB->getNumber()
+           << " <= BB" << SrcMBB->getNumber() << "\n";
   );
+  DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
 
-  RegiT initReg = INVALIDREGNUM;
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  if (contingLoop != contLoop) {
-    initReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    assert(initReg != INVALIDREGNUM);
-    addLoopContInitReg(contLoop, initReg);
-    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
-      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
-      contingLoop = contingLoop->getParentLoop();
-    }
-    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
-    addLoopContOnReg(contingLoop, initReg);
-  }
+  DstMBB->removeSuccessor(SrcMBB);
+  cloneSuccessorList(DstMBB, SrcMBB);
 
-  settleLoopcontBlock(contingBlk, contBlk, initReg);
-} //handleLoopcontBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
-  DEBUG(
-    dbgs() << "serialPattern BB" << dstBlk->getNumber()
-           << " <= BB" << srcBlk->getNumber() << "\n";
-  );
-  dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end());
-
-  dstBlk->removeSuccessor(srcBlk);
-  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
+  removeSuccessor(SrcMBB);
+  MLI->removeBlock(SrcMBB);
+  retireBlock(SrcMBB);
+}
 
-  removeSuccessor(srcBlk);
-  retireBlock(dstBlk, srcBlk);
-} //mergeSerialBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
-                                                  BlockT *curBlk,
-                                                  BlockT *trueBlk,
-                                                  BlockT *falseBlk,
-                                                  BlockT *landBlk) {
+void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
+    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
   DEBUG(
-    dbgs() << "ifPattern BB" << curBlk->getNumber();
+    dbgs() << "ifPattern BB" << MBB->getNumber();
     dbgs() << "{  ";
-    if (trueBlk) {
-      dbgs() << "BB" << trueBlk->getNumber();
+    if (TrueMBB) {
+      dbgs() << "BB" << TrueMBB->getNumber();
     }
     dbgs() << "  } else ";
     dbgs() << "{  ";
-    if (falseBlk) {
-      dbgs() << "BB" << falseBlk->getNumber();
+    if (FalseMBB) {
+      dbgs() << "BB" << FalseMBB->getNumber();
     }
     dbgs() << "  }\n ";
     dbgs() << "landBlock: ";
-    if (landBlk == NULL) {
+    if (!LandMBB) {
       dbgs() << "NULL";
     } else {
-      dbgs() << "BB" << landBlk->getNumber();
+      dbgs() << "BB" << LandMBB->getNumber();
     }
     dbgs() << "\n";
   );
 
-  int oldOpcode = branchInstr->getOpcode();
-  DebugLoc branchDL = branchInstr->getDebugLoc();
+  int OldOpcode = BranchMI->getOpcode();
+  DebugLoc BranchDL = BranchMI->getDebugLoc();
 
 //    transform to
 //    if cond
@@ -1401,1645 +1488,390 @@ void CFGStructurizer<PassT>::mergeIfthen
 //    endif
 //    landBlk
 
-  typename BlockT::iterator branchInstrPos =
-    CFGTraits::getInstrPos(curBlk, branchInstr);
-  CFGTraits::insertCondBranchBefore(branchInstrPos,
-                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
-                                    passRep,
-                                    branchDL);
-
-  if (trueBlk) {
-    curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end());
-    curBlk->removeSuccessor(trueBlk);
-    if (landBlk && trueBlk->succ_size()!=0) {
-      trueBlk->removeSuccessor(landBlk);
-    }
-    retireBlock(curBlk, trueBlk);
-  }
-  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep);
-
-  if (falseBlk) {
-    curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(),
-                   falseBlk->end());
-    curBlk->removeSuccessor(falseBlk);
-    if (landBlk && falseBlk->succ_size() != 0) {
-      falseBlk->removeSuccessor(landBlk);
-    }
-    retireBlock(curBlk, falseBlk);
-  }
-  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
-
-  branchInstr->eraseFromParent();
-
-  if (landBlk && trueBlk && falseBlk) {
-    curBlk->addSuccessor(landBlk);
-  }
-
-} //mergeIfthenelseBlock
+  MachineBasicBlock::iterator I = BranchMI;
+  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
+      BranchDL);
+
+  if (TrueMBB) {
+    MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
+    MBB->removeSuccessor(TrueMBB);
+    if (LandMBB && TrueMBB->succ_size()!=0)
+      TrueMBB->removeSuccessor(LandMBB);
+    retireBlock(TrueMBB);
+    MLI->removeBlock(TrueMBB);
+  }
+
+  if (FalseMBB) {
+    insertInstrBefore(I, AMDGPU::ELSE);
+    MBB->splice(I, FalseMBB, FalseMBB->begin(),
+                   FalseMBB->end());
+    MBB->removeSuccessor(FalseMBB);
+    if (LandMBB && FalseMBB->succ_size() != 0)
+      FalseMBB->removeSuccessor(LandMBB);
+    retireBlock(FalseMBB);
+    MLI->removeBlock(FalseMBB);
+  }
+  insertInstrBefore(I, AMDGPU::ENDIF);
+
+  BranchMI->eraseFromParent();
+
+  if (LandMBB && TrueMBB && FalseMBB)
+    MBB->addSuccessor(LandMBB);
 
-template<class PassT>
-void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
-                                                LoopLandInfo *loopLand) {
-  BlockT *landBlk = loopLand->landBlk;
-
-  DEBUG(
-    dbgs() << "loopPattern header = BB" << dstBlk->getNumber()
-           << " land = BB" << landBlk->getNumber() << "\n";
-  );
+}
 
-  // Loop contInitRegs are init at the beginning of the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->contInitRegs.begin(),
-       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
+void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
 
   /* we last inserted the DebugLoc in the
-   * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
+   * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current
+   * dstBlk.
    * search for the DebugLoc in that statement.
    * if not found, we have to insert the empty/default DebugLoc */
-  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
-  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
+  MachineInstr *LoopBreakInstr = getLoopBreakInstr(DstBlk);
+  DebugLoc DLBreak = (LoopBreakInstr) ? LoopBreakInstr->getDebugLoc() :
+      DebugLoc();
 
-  CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
-  // Loop breakInitRegs are init before entering the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->breakInitRegs.begin(),
-       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
-  // Loop endbranchInitRegs are init before entering the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->endbranchInitRegs.begin(),
-       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
+  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DLBreak);
 
-  /* we last inserted the DebugLoc in the continue statement in the current dstBlk
+  /* we last inserted the DebugLoc in the continue statement in the current
+   * dstBlk.
    * search for the DebugLoc in the continue statement.
    * if not found, we have to insert the empty/default DebugLoc */
-  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
-  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
-
-  CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
-  // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this
-  // loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->breakOnRegs.begin(),
-       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep,
-                                   *iter);
-  }
-
-  // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this
-  // loop.
-  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
-       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
-                                   passRep, *iter);
-  }
-
-  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
-
-  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
-       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
-    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
-  }
-
-  removeSuccessor(landBlk);
-  retireBlock(dstBlk, landBlk);
-} //mergeLooplandBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) {
-  while (I--) {
-    if (I->getOpcode() == AMDGPU::PRED_X) {
-      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
-      case OPCODE_IS_ZERO_INT:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT);
-        return;
-      case OPCODE_IS_NOT_ZERO_INT:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT);
-        return;
-      case OPCODE_IS_ZERO:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO);
-        return;
-      case OPCODE_IS_NOT_ZERO:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO);
-        return;
-      default:
-        llvm_unreachable("PRED_X Opcode invalid!");
-      }
-    }
-  }
+  MachineInstr *ContinueInstr = getContinueInstr(DstBlk);
+  DebugLoc DLContinue = (ContinueInstr) ? ContinueInstr->getDebugLoc() :
+      DebugLoc();
+
+  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DLContinue);
+  DstBlk->addSuccessor(LandMBB);
+  DstBlk->removeSuccessor(DstBlk);
 }
 
-template<class PassT>
-void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
-                                                 BlockT *exitBlk,
-                                                 BlockT *exitLandBlk,
-                                                 RegiT  setReg) {
-  DEBUG(
-    dbgs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
-           << " exit = BB" << exitBlk->getNumber()
-           << " land = BB" << exitLandBlk->getNumber() << "\n";
-  );
-
-  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
-  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
-
-  DebugLoc DL = branchInstr->getDebugLoc();
-
-  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
-
-  //    transform exitingBlk to
-  //    if ( ) {
-  //       exitBlk (if exitBlk != exitLandBlk)
-  //       setReg = 1
-  //       break
-  //    }endif
-  //    successor = {orgSuccessor(exitingBlk) - exitBlk}
-
-  typename BlockT::iterator branchInstrPos =
-    CFGTraits::getInstrPos(exitingBlk, branchInstr);
-
-  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
-    //break_logical
-
-    if (trueBranch != exitBlk) {
-      reversePredicateSetter(branchInstrPos);
-    }
-    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
-  } else {
-    if (trueBranch != exitBlk) {
-      reversePredicateSetter(branchInstr);
-    }
-    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
-    if (exitBlk != exitLandBlk) {
-      //splice is insert-before ...
-      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
-                         exitBlk->end());
-    }
-    if (setReg != INVALIDREGNUM) {
-      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
-    }
-    CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
-  } //if_logical
 
+void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DebugLoc DL = BranchMI->getDebugLoc();
+  MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
+  MachineBasicBlock::iterator I = BranchMI;
+  if (TrueBranch != LandMBB)
+    reversePredicateSetter(I);
+  insertCondBranchBefore(I, AMDGPU::PREDICATED_BREAK, DL);
   //now branchInst can be erase safely
-  branchInstr->eraseFromParent();
-
+  BranchMI->eraseFromParent();
   //now take care of successors, retire blocks
-  exitingBlk->removeSuccessor(exitBlk);
-  if (exitBlk != exitLandBlk) {
-    //splice is insert-before ...
-    exitBlk->removeSuccessor(exitLandBlk);
-    retireBlock(exitingBlk, exitBlk);
-  }
-
-} //mergeLoopbreakBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
-                                                 BlockT *contBlk,
-                                                 RegiT   setReg) {
-  DEBUG(
-    dbgs() << "settleLoopcontBlock conting = BB"
-           << contingBlk->getNumber()
-           << ", cont = BB" << contBlk->getNumber() << "\n";
-  );
-
-  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
-  if (branchInstr) {
-    assert(CFGTraits::isCondBranch(branchInstr));
-    typename BlockT::iterator branchInstrPos =
-      CFGTraits::getInstrPos(contingBlk, branchInstr);
-    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
-    int oldOpcode = branchInstr->getOpcode();
-    DebugLoc DL = branchInstr->getDebugLoc();
-
-    //    transform contingBlk to
-    //     if () {
-    //          move instr after branchInstr
-    //          continue
-    //        or
-    //          setReg = 1
-    //          break
-    //     }endif
-    //     successor = {orgSuccessor(contingBlk) - loopHeader}
-
-    bool useContinueLogical = 
-      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
-
-    if (useContinueLogical == false) {
-      int branchOpcode =
-        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
-                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
-
-      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
-
-      if (setReg != INVALIDREGNUM) {
-        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
-        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
-      } else {
-        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
-      }
+  ExitingMBB->removeSuccessor(LandMBB);
+}
 
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
+void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineBasicBlock *ContMBB) {
+  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+               << ContingMBB->getNumber()
+               << ", cont = BB" << ContMBB->getNumber() << "\n";);
+
+  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
+  if (MI) {
+    assert(isCondBranch(MI));
+    MachineBasicBlock::iterator I = MI;
+    MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+    int OldOpcode = MI->getOpcode();
+    DebugLoc DL = MI->getDebugLoc();
+
+    bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
+
+    if (UseContinueLogical == false) {
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
+          getBranchZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
+      insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
     } else {
-      int branchOpcode =
-        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
-                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
-
-      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
+          getContinueZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
     }
 
-    branchInstr->eraseFromParent();
+    MI->eraseFromParent();
   } else {
     // if we've arrived here then we've already erased the branch instruction
-    // travel back up the basic block to see the last reference of our debug location
-    // we've just inserted that reference here so it should be representative
-    if (setReg != INVALIDREGNUM) {
-      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
-      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
-    } else {
-      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
-    }
-  } //else
-
-} //settleLoopcontBlock
-
-// BBs in exitBlkSet are determined as in break-path for loopRep,
-// before we can put code for BBs as inside loop-body for loopRep
-// check whether those BBs are determined as cont-BB for parentLoopRep
-// earlier.
-// If so, generate a new BB newBlk
-//    (1) set newBlk common successor of BBs in exitBlkSet
-//    (2) change the continue-instr in BBs in exitBlkSet to break-instr
-//    (3) generate continue-instr in newBlk
-//
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
-                                              LoopT *loopRep,
-                                              std::set<BlockT *> &exitBlkSet,
-                                              BlockT *exitLandBlk) {
-  std::set<BlockT *> endBlkSet;
-
-
-
-  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
-       iterEnd = exitBlkSet.end();
-       iter != iterEnd; ++iter) {
-    BlockT *exitBlk = *iter;
-    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
-
-    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
-      return NULL;
-
-    endBlkSet.insert(endBlk);
-  }
-
-  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(newBlk);  //insert to function
-  CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
-  SHOWNEWBLK(newBlk, "New continue block: ");
-
-  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
-       iterEnd = endBlkSet.end();
-       iter != iterEnd; ++iter) {
-      BlockT *endBlk = *iter;
-      InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
-      if (contInstr) {
-        contInstr->eraseFromParent();
-      }
-      endBlk->addSuccessor(newBlk);
-      DEBUG(
-        dbgs() << "Add new continue Block to BB"
-               << endBlk->getNumber() << " successors\n";
-      );
-  }
-
-  return newBlk;
-} //relocateLoopcontBlock
-
-
-// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as
-// LoopLandBlock. This BB branch on the loop endBranchInit register to the
-// pathes corresponding to the loop exiting branches.
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
-                                              BlockTSmallerVector &exitingBlks,
-                                              BlockTSmallerVector &exitBlks) {
-  const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-
-  RegiT endBranchReg = static_cast<int>
-    (funcRep->getRegInfo().createVirtualRegister(I32RC));
-  assert(endBranchReg >= 0);
-
-  // reg = 0 before entering the loop
-  addLoopEndbranchInitReg(loopRep, endBranchReg);
-
-  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
-  assert(numBlks >=2 && numBlks == exitBlks.size());
-
-  BlockT *preExitingBlk = exitingBlks[0];
-  BlockT *preExitBlk = exitBlks[0];
-  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(preBranchBlk);  //insert to function
-  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
-
-  BlockT *newLandBlk = preBranchBlk;
-
-      CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
-        newLandBlk);
-  preExitingBlk->removeSuccessor(preExitBlk);
-  preExitingBlk->addSuccessor(newLandBlk);
-
-  //it is redundant to add reg = 0 to exitingBlks[0]
-
-  // For 1..n th exiting path (the last iteration handles two pathes) create the
-  // branch to the previous path and the current path.
-  for (uint32_t i = 1; i < numBlks; ++i) {
-    BlockT *curExitingBlk = exitingBlks[i];
-    BlockT *curExitBlk = exitBlks[i];
-    BlockT *curBranchBlk;
-
-    if (i == numBlks - 1) {
-      curBranchBlk = curExitBlk;
-    } else {
-      curBranchBlk = funcRep->CreateMachineBasicBlock();
-      funcRep->push_back(curBranchBlk);  //insert to function
-      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
-    }
-
-    // Add reg = i to exitingBlks[i].
-    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
-                                       endBranchReg, i);
-
-    // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge
-    // (exitingBlks[i], newLandBlk).
-    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
-                                          newLandBlk);
-    curExitingBlk->removeSuccessor(curExitBlk);
-    curExitingBlk->addSuccessor(newLandBlk);
-
-    // add to preBranchBlk the branch instruction:
-    // if (endBranchReg == preVal)
-    //    preExitBlk
-    // else
-    //    curBranchBlk
-    //
-    // preValReg = i - 1
-
-  DebugLoc DL;
-  RegiT preValReg = static_cast<int>
-    (funcRep->getRegInfo().createVirtualRegister(I32RC));
-
-  preBranchBlk->insert(preBranchBlk->begin(),
-                       tii->getMovImmInstr(preBranchBlk->getParent(), preValReg,
-                       i - 1));
-
-  // condResReg = (endBranchReg == preValReg)
-    RegiT condResReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
-      .addReg(endBranchReg).addReg(preValReg);
-
-    BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
-      .addMBB(preExitBlk).addReg(condResReg);
-
-    preBranchBlk->addSuccessor(preExitBlk);
-    preBranchBlk->addSuccessor(curBranchBlk);
-
-    // Update preExitingBlk, preExitBlk, preBranchBlk.
-    preExitingBlk = curExitingBlk;
-    preExitBlk = curExitBlk;
-    preBranchBlk = curBranchBlk;
-
-  }  //end for 1 .. n blocks
-
-  return newLandBlk;
-} //addLoopEndbranchBlock
-
-template<class PassT>
-typename CFGStructurizer<PassT>::PathToKind
-CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
-                                     bool allowSideEntry) {
-  assert(dstBlk);
-
-  if (srcBlk == dstBlk) {
-    return SinglePath_InPath;
-  }
-
-  while (srcBlk && srcBlk->succ_size() == 1) {
-    srcBlk = *srcBlk->succ_begin();
-    if (srcBlk == dstBlk) {
-      return SinglePath_InPath;
-    }
-
-    if (!allowSideEntry && srcBlk->pred_size() > 1) {
-      return Not_SinglePath;
-    }
-  }
-
-  if (srcBlk && srcBlk->succ_size()==0) {
-    return SinglePath_NotInPath;
-  }
-
-  return Not_SinglePath;
-} //singlePathTo
-
-// If there is a single path from srcBlk to dstBlk, return the last block before
-// dstBlk If there is a single path from srcBlk->end without dstBlk, return the
-// last block in the path Otherwise, return NULL
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
-                                      bool allowSideEntry) {
-  assert(dstBlk);
-
-  if (srcBlk == dstBlk) {
-    return srcBlk;
-  }
-
-  if (srcBlk->succ_size() == 0) {
-    return srcBlk;
+    // Travel back up the basic block to find the last debug location in it;
+    // we've just inserted that reference here, so it should be representative.
+    // Use insertInstrEnd to ensure phi-moves, if they exist, go before the
+    // continue-instr.
+    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+        getLastDebugLocInBB(ContingMBB));
   }
+}
 
-  while (srcBlk && srcBlk->succ_size() == 1) {
-    BlockT *preBlk = srcBlk;
-
-    srcBlk = *srcBlk->succ_begin();
-    if (srcBlk == NULL) {
-      return preBlk;
+int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
+  int Cloned = 0;
+  assert(PreMBB->isSuccessor(SrcMBB));
+  while (SrcMBB && SrcMBB != DstMBB) {
+    assert(SrcMBB->succ_size() == 1);
+    if (SrcMBB->pred_size() > 1) {
+      SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
+      ++Cloned;
     }
 
-    if (!allowSideEntry && srcBlk->pred_size() > 1) {
-      return NULL;
-    }
+    PreMBB = SrcMBB;
+    SrcMBB = *SrcMBB->succ_begin();
   }
 
-  if (srcBlk && srcBlk->succ_size()==0) {
-    return srcBlk;
-  }
-
-  return NULL;
-
-} //singlePathEnd
+  return Cloned;
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
-                                               BlockT *dstBlk) {
-  int cloned = 0;
-  assert(preBlk->isSuccessor(srcBlk));
-  while (srcBlk && srcBlk != dstBlk) {
-    assert(srcBlk->succ_size() == 1);
-    if (srcBlk->pred_size() > 1) {
-      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
-      ++cloned;
-    }
-
-    preBlk = srcBlk;
-    srcBlk = *srcBlk->succ_begin();
-  }
-
-  return cloned;
-} //cloneOnSideEntryTo
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
-                                                 BlockT *predBlk) {
-  assert(predBlk->isSuccessor(curBlk) &&
+MachineBasicBlock *
+AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
+    MachineBasicBlock *PredMBB) {
+  assert(PredMBB->isSuccessor(MBB) &&
          "succBlk is not a prececessor of curBlk");
 
-  BlockT *cloneBlk = CFGTraits::clone(curBlk);  //clone instructions
-  CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
+  MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
+  replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
   //srcBlk, oldBlk, newBlk
 
-  predBlk->removeSuccessor(curBlk);
-  predBlk->addSuccessor(cloneBlk);
+  PredMBB->removeSuccessor(MBB);
+  PredMBB->addSuccessor(CloneMBB);
 
   // add all successor to cloneBlk
-  CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
+  cloneSuccessorList(CloneMBB, MBB);
 
-  numClonedInstr += curBlk->size();
+  numClonedInstr += MBB->size();
 
   DEBUG(
     dbgs() << "Cloned block: " << "BB"
-           << curBlk->getNumber() << "size " << curBlk->size() << "\n";
+           << MBB->getNumber() << "size " << MBB->size() << "\n";
   );
 
-  SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
-
-  return cloneBlk;
-} //cloneBlockForPredecessor
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
-                                               BlockT *exitingBlk) {
-  BlockT *exitBlk = NULL;
-
-  for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
-       iterSuccEnd = exitingBlk->succ_end();
-       iterSucc != iterSuccEnd; ++iterSucc) {
-    BlockT *curBlk = *iterSucc;
-    if (!loopRep->contains(curBlk)) {
-      assert(exitBlk == NULL);
-      exitBlk = curBlk;
-    }
-  }
+  SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
 
-  assert(exitBlk != NULL);
-
-  return exitBlk;
-} //exitingBlock2ExitBlock
+  return CloneMBB;
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
-                                                BlockT *dstBlk,
-                                                InstrIterator insertPos) {
-  InstrIterator spliceEnd;
+void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
+    MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator SpliceEnd;
   //look for the input branchinstr, not the AMDGPU branchinstr
-  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
-  if (branchInstr == NULL) {
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
+  if (!BranchMI) {
     DEBUG(
       dbgs() << "migrateInstruction don't see branch instr\n" ;
     );
-    spliceEnd = srcBlk->end();
+    SpliceEnd = SrcMBB->end();
   } else {
     DEBUG(
       dbgs() << "migrateInstruction see branch instr\n" ;
-      branchInstr->dump();
+      BranchMI->dump();
     );
-    spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
+    SpliceEnd = BranchMI;
   }
   DEBUG(
-    dbgs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
-      << "srcSize = " << srcBlk->size() << "\n";
+    dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
   );
 
   //splice insert before insertPos
-  dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
+  DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
 
   DEBUG(
-    dbgs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
-      << "srcSize = " << srcBlk->size() << "\n";
+    dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
   );
-} //migrateInstruction
+}
 
-// normalizeInfiniteLoopExit change
-//   B1:
-//        uncond_br LoopHeader
-//
-// to
-//   B1:
-//        cond_br 1 LoopHeader dummyExit
-// and return the newly added dummy exit block
-// 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
-  BlockT *loopHeader;
-  BlockT *loopLatch;
-  loopHeader = LoopRep->getHeader();
-  loopLatch = LoopRep->getLoopLatch();
-  BlockT *dummyExitBlk = NULL;
+MachineBasicBlock *
+AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
   const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  if (loopHeader!=NULL && loopLatch!=NULL) {
-    InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
-    if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
-      dummyExitBlk = funcRep->CreateMachineBasicBlock();
-      funcRep->push_back(dummyExitBlk);  //insert to function
-      SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
-
-      DEBUG(dbgs() << "Old branch instr: " << *branchInstr << "\n";);
-
-      typename BlockT::iterator insertPos =
-        CFGTraits::getInstrPos(loopLatch, branchInstr);
-      unsigned immReg =
-        funcRep->getRegInfo().createVirtualRegister(I32RC);
-      CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
-      InstrT *newInstr = 
-        CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep);
-      MachineInstrBuilder MIB(*funcRep, newInstr);
-      MIB.addMBB(loopHeader);
-      MIB.addReg(immReg, false);
-
-      SHOWNEWINSTR(newInstr);
-
-      branchInstr->eraseFromParent();
-      loopLatch->addSuccessor(dummyExitBlk);
-    }
-  }
 
-  return dummyExitBlk;
-} //normalizeInfiniteLoopExit
-
-template<class PassT>
-void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
-  InstrT *branchInstr;
+  if (!LoopHeader || !LoopLatch)
+    return NULL;
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
+  // Is LoopRep an infinite loop ?
+  if (!BranchMI || !isUncondBranch(BranchMI))
+    return NULL;
+
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+  MachineBasicBlock::iterator I = BranchMI;
+  unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC);
+  llvm_unreachable("Extra register needed to handle CFG");
+  MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32);
+  MachineInstrBuilder MIB(*FuncRep, NewMI);
+  MIB.addMBB(LoopHeader);
+  MIB.addReg(ImmReg, false);
+  SHOWNEWINSTR(NewMI);
+  BranchMI->eraseFromParent();
+  LoopLatch->addSuccessor(DummyExitBlk);
+
+  return DummyExitBlk;
+}
+
+void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
+  MachineInstr *BranchMI;
 
   // I saw two unconditional branch in one basic block in example
   // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
-  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
-          && CFGTraits::isUncondBranch(branchInstr)) {
-    DEBUG(
-          dbgs() << "Removing unconditional branch instruction" ;
-      branchInstr->dump();
-    );
-    branchInstr->eraseFromParent();
+  while ((BranchMI = getLoopendBlockBranchInstr(MBB))
+          && isUncondBranch(BranchMI)) {
+    DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump(););
+    BranchMI->eraseFromParent();
   }
-} //removeUnconditionalBranch
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
-  if (srcBlk->succ_size() == 2) {
-    BlockT *blk1 = *srcBlk->succ_begin();
-    BlockT *blk2 = *(++srcBlk->succ_begin());
-
-    if (blk1 == blk2) {
-      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
-      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
-      DEBUG(
-        dbgs() << "Removing unneeded conditional branch instruction" ;
-        branchInstr->dump();
-      );
-      branchInstr->eraseFromParent();
-      SHOWNEWBLK(blk1, "Removing redundant successor");
-      srcBlk->removeSuccessor(blk1);
-    }
-  }
-} //removeRedundantConditionalBranch
-
-template<class PassT>
-void CFGStructurizer<PassT>::addDummyExitBlock(SmallVectorImpl<BlockT *>
-                                               &retBlks) {
-  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(dummyExitBlk);  //insert to function
-  CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
-
-  for (typename SmallVectorImpl<BlockT *>::iterator iter =
-         retBlks.begin(),
-       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
-    BlockT *curBlk = *iter;
-    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
-    if (curInstr) {
-      curInstr->eraseFromParent();
-    }
-    curBlk->addSuccessor(dummyExitBlk);
+void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
+    MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 2)
+    return;
+  MachineBasicBlock *MBB1 = *MBB->succ_begin();
+  MachineBasicBlock *MBB2 = *(++MBB->succ_begin());
+  if (MBB1 != MBB2)
+    return;
+
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump(););
+  BranchMI->eraseFromParent();
+  SHOWNEWBLK(MBB1, "Removing redundant successor");
+  MBB->removeSuccessor(MBB1);
+}
+
+void AMDGPUCFGStructurizer::addDummyExitBlock(
+    SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
+       E = RetMBB.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    MachineInstr *MI = getReturnInstr(MBB);
+    if (MI)
+      MI->eraseFromParent();
+    MBB->addSuccessor(DummyExitBlk);
     DEBUG(
-      dbgs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
+      dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
              << " successors\n";
     );
-  } //for
-
-  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
-} //addDummyExitBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
-  while (srcBlk->succ_size()) {
-    srcBlk->removeSuccessor(*srcBlk->succ_begin());
   }
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
 }
 
-template<class PassT>
-void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
-  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
-
-  if (srcBlkInfo == NULL) {
-    srcBlkInfo = new BlockInfo();
-  }
-
-  srcBlkInfo->sccNum = sccNum;
+void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
+  while (MBB->succ_size())
+    MBB->removeSuccessor(*MBB->succ_begin());
 }
 
-template<class PassT>
-int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
-  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
-  return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
+void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
+    int SccNum) {
+  BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
+  if (!srcBlkInfo)
+    srcBlkInfo = new BlockInformation();
+  srcBlkInfo->SccNum = SccNum;
 }
 
-template<class PassT>
-void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
+void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
   DEBUG(
-        dbgs() << "Retiring BB" << srcBlk->getNumber() << "\n";
+        dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
   );
 
-  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+  BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
 
-  if (srcBlkInfo == NULL) {
-    srcBlkInfo = new BlockInfo();
-  }
+  if (!SrcBlkInfo)
+    SrcBlkInfo = new BlockInformation();
 
-  srcBlkInfo->isRetired = true;
-  assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
+  SrcBlkInfo->IsRetired = true;
+  assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
          && "can't retire block yet");
 }
 
-template<class PassT>
-bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
-  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
-  return (srcBlkInfo && srcBlkInfo->isRetired);
-}
-
-template<class PassT>
-bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  while (loopRep && loopRep->getHeader() == curBlk) {
-    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
-
-    if(loopLand == NULL)
-      return true;
-
-    BlockT *landBlk = loopLand->landBlk;
-    assert(landBlk);
-    if (!isRetiredBlock(landBlk)) {
-      return true;
-    }
-
-    loopRep = loopRep->getParentLoop();
+void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep,
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock *&TheEntry = LLInfoMap[loopRep];
+  if (!MBB) {
+    MBB = FuncRep->CreateMachineBasicBlock();
+    FuncRep->push_back(MBB);  //insert to function
+    SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: ");
   }
-
-  return false;
-} //isActiveLoophead
-
-template<class PassT>
-bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
-  const unsigned blockSizeThreshold = 30;
-  const unsigned cloneInstrThreshold = 100;
-
-  bool multiplePreds = blk && (blk->pred_size() > 1);
-
-  if(!multiplePreds)
-    return false;
-
-  unsigned blkSize = blk->size();
-  return ((blkSize > blockSizeThreshold)
-          && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
-} //needMigrateBlock
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
-                                            BlockTSmallerVector &exitBlks,
-                                            std::set<BlockT *> &exitBlkSet) {
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks;  //in exit path blocks
-
-  for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
-       predIterEnd = landBlk->pred_end();
-       predIter != predIterEnd; ++predIter) {
-    BlockT *curBlk = *predIter;
-    if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
-      inpathBlks.push_back(curBlk);
-    }
-  } //for
-
-  //if landBlk has predecessors that are not in the given loop,
-  //create a new block
-  BlockT *newLandBlk = landBlk;
-  if (inpathBlks.size() != landBlk->pred_size()) {
-    newLandBlk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(newLandBlk);  //insert to function
-    newLandBlk->addSuccessor(landBlk);
-    for (typename SmallVectorImpl<BlockT *>::iterator iter =
-         inpathBlks.begin(),
-         iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
-      BlockT *curBlk = *iter;
-      CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
-      //srcBlk, oldBlk, newBlk
-      curBlk->removeSuccessor(landBlk);
-      curBlk->addSuccessor(newLandBlk);
-    }
-    for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
-      if (exitBlks[i] == landBlk) {
-        exitBlks[i] = newLandBlk;
-      }
-    }
-    SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
-  }
-
-  setLoopLandBlock(loopRep, newLandBlk);
-
-  return newLandBlk;
-} // recordLoopbreakLand
-
-template<class PassT>
-void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  assert(theEntry->landBlk == NULL);
-
-  if (blk == NULL) {
-    blk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(blk);  //insert to function
-    SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
-  }
-
-  theEntry->landBlk = blk;
-
+  TheEntry = MBB;
   DEBUG(
     dbgs() << "setLoopLandBlock loop-header = BB"
            << loopRep->getHeader()->getNumber()
-           << "  landing-block = BB" << blk->getNumber() << "\n";
-  );
-} // setLoopLandBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-
-  theEntry->breakOnRegs.insert(regNum);
-
-  DEBUG(
-    dbgs() << "addLoopBreakOnReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  );
-} // addLoopBreakOnReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->contOnRegs.insert(regNum);
-
-  DEBUG(
-    dbgs() << "addLoopContOnReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  );
-} // addLoopContOnReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->breakInitRegs.insert(regNum);
-
-  DEBUG(
-    dbgs() << "addLoopBreakInitReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  );
-} // addLoopBreakInitReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->contInitRegs.insert(regNum);
-
-  DEBUG(
-    dbgs() << "addLoopContInitReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
+           << "  landing-block = BB" << MBB->getNumber() << "\n";
   );
-} // addLoopContInitReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
-                                                     RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->endbranchInitRegs.insert(regNum);
-
-  DEBUG(
-        dbgs() << "addLoopEndbranchInitReg loop-header = BB"
-      << loopRep->getHeader()->getNumber()
-      << "  regNum = " << regNum << "\n";
-  );
-} // addLoopEndbranchInitReg
-
-template<class PassT>
-typename CFGStructurizer<PassT>::LoopLandInfo *
-CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  return theEntry;
-} // getLoopLandInfo
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  return theEntry ? theEntry->landBlk : NULL;
-} // getLoopLandBlock
-
-
-template<class PassT>
-bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  if (loopRep == NULL)
-    return false;
-
-  BlockT *loopHeader = loopRep->getHeader();
-
-  return curBlk->isSuccessor(loopHeader);
-
-} //hasBackEdge
-
-template<class PassT>
-unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
-  return loopRep ? loopRep->getLoopDepth() : 0;
-} //getLoopDepth
-
-template<class PassT>
-int CFGStructurizer<PassT>::countActiveBlock
-(typename SmallVectorImpl<BlockT *>::const_iterator iterStart,
- typename SmallVectorImpl<BlockT *>::const_iterator iterEnd) {
-  int count = 0;
-  while (iterStart != iterEnd) {
-    if (!isRetiredBlock(*iterStart)) {
-      ++count;
-    }
-    ++iterStart;
-  }
-
-  return count;
-} //countActiveBlock
-
-// This is work around solution for findNearestCommonDominator not avaiable to
-// post dom a proper fix should go to Dominators.h.
+}
 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT*
-CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
+MachineBasicBlock *
+AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
+    MachineBasicBlock *MBB2) {
 
-  if (postDomTree->dominates(blk1, blk2)) {
-    return blk1;
-  }
-  if (postDomTree->dominates(blk2, blk1)) {
-    return blk2;
-  }
+  if (PDT->dominates(MBB1, MBB2))
+    return MBB1;
+  if (PDT->dominates(MBB2, MBB1))
+    return MBB2;
 
-  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
-  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
+  MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
+  MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
 
   // Handle newly cloned node.
-  if (node1 == NULL && blk1->succ_size() == 1) {
-    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
-  }
-  if (node2 == NULL && blk2->succ_size() == 1) {
-    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
-  }
+  if (!Node1 && MBB1->succ_size() == 1)
+    return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
+  if (!Node2 && MBB2->succ_size() == 1)
+    return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
 
-  if (node1 == NULL || node2 == NULL) {
+  if (!Node1 || !Node2)
     return NULL;
-  }
 
-  node1 = node1->getIDom();
-  while (node1) {
-    if (postDomTree->dominates(node1, node2)) {
-      return node1->getBlock();
-    }
-    node1 = node1->getIDom();
+  Node1 = Node1->getIDom();
+  while (Node1) {
+    if (PDT->dominates(Node1, Node2))
+      return Node1->getBlock();
+    Node1 = Node1->getIDom();
   }
 
   return NULL;
 }
 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::findNearestCommonPostDom
-(typename std::set<BlockT *> &blks) {
-  BlockT *commonDom;
-  typename std::set<BlockT *>::const_iterator iter = blks.begin();
-  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
-  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
-    BlockT *curBlk = *iter;
-    if (curBlk != commonDom) {
-      commonDom = findNearestCommonPostDom(curBlk, commonDom);
-    }
+MachineBasicBlock *
+AMDGPUCFGStructurizer::findNearestCommonPostDom(
+    std::set<MachineBasicBlock *> &MBBs) {
+  MachineBasicBlock *CommonDom;
+  std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
+  std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
+  for (CommonDom = *It; It != E && CommonDom; ++It) {
+    MachineBasicBlock *MBB = *It;
+    if (MBB != CommonDom)
+      CommonDom = findNearestCommonPostDom(MBB, CommonDom);
   }
 
   DEBUG(
     dbgs() << "Common post dominator for exit blocks is ";
-    if (commonDom) {
-          dbgs() << "BB" << commonDom->getNumber() << "\n";
-    } else {
+    if (CommonDom)
+          dbgs() << "BB" << CommonDom->getNumber() << "\n";
+    else
       dbgs() << "NULL\n";
-    }
   );
 
-  return commonDom;
-} //findNearestCommonPostDom
-
-} // end anonymous namespace
-
-//todo: move-end
-
-
-//===----------------------------------------------------------------------===//
-//
-// CFGStructurizer for AMDGPU
-//
-//===----------------------------------------------------------------------===//
-
-
-namespace {
-class AMDGPUCFGStructurizer : public MachineFunctionPass {
-public:
-  typedef MachineInstr              InstructionType;
-  typedef MachineFunction           FunctionType;
-  typedef MachineBasicBlock         BlockType;
-  typedef MachineLoopInfo           LoopinfoType;
-  typedef MachineDominatorTree      DominatortreeType;
-  typedef MachinePostDominatorTree  PostDominatortreeType;
-  typedef MachineDomTreeNode        DomTreeNodeType;
-  typedef MachineLoop               LoopType;
-
-protected:
-  TargetMachine &TM;
-
-public:
-  AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
-  const TargetInstrInfo *getTargetInstrInfo() const;
-  const AMDGPURegisterInfo *getTargetRegisterInfo() const;
-};
-
-} // end anonymous namespace
-AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
-  : MachineFunctionPass(pid), TM(tm) {
-}
-
-const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
-  return TM.getInstrInfo();
-}
-
-const AMDGPURegisterInfo *AMDGPUCFGStructurizer::getTargetRegisterInfo() const {
-  return static_cast<const AMDGPURegisterInfo *>(TM.getRegisterInfo());
-}
-
-//===----------------------------------------------------------------------===//
-//
-// CFGPrepare
-//
-//===----------------------------------------------------------------------===//
-
-
-namespace {
-class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer {
-public:
-  static char ID;
-
-public:
-  AMDGPUCFGPrepare(TargetMachine &tm);
-
-  virtual const char *getPassName() const;
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-
-  bool runOnMachineFunction(MachineFunction &F);
-};
-
-char AMDGPUCFGPrepare::ID = 0;
-} // end anonymous namespace
-
-AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm)
-  : AMDGPUCFGStructurizer(ID, tm )  {
-}
-const char *AMDGPUCFGPrepare::getPassName() const {
-  return "AMD IL Control Flow Graph Preparation Pass";
-}
-
-void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<MachineFunctionAnalysis>();
-  AU.addRequired<MachineFunctionAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<MachinePostDominatorTree>();
-  AU.addRequired<MachineLoopInfo>();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// CFGPerform
-//
-//===----------------------------------------------------------------------===//
-
-
-namespace {
-class AMDGPUCFGPerform : public AMDGPUCFGStructurizer {
-public:
-  static char ID;
-
-public:
-  AMDGPUCFGPerform(TargetMachine &tm);
-  virtual const char *getPassName() const;
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-  bool runOnMachineFunction(MachineFunction &F);
-};
-
-char AMDGPUCFGPerform::ID = 0;
-} // end anonymous namespace
-
-  AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm)
-: AMDGPUCFGStructurizer(ID, tm) {
-}
-
-const char *AMDGPUCFGPerform::getPassName() const {
-  return "AMD IL Control Flow Graph structurizer Pass";
+  return CommonDom;
 }
 
-void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<MachineFunctionAnalysis>();
-  AU.addRequired<MachineFunctionAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<MachinePostDominatorTree>();
-  AU.addRequired<MachineLoopInfo>();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// CFGStructTraits<AMDGPUCFGStructurizer>
-//
-//===----------------------------------------------------------------------===//
-
-namespace {
-// this class is tailor to the AMDGPU backend
-template<>
-struct CFGStructTraits<AMDGPUCFGStructurizer> {
-  typedef int RegiT;
-
-  static int getBranchNzeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
-    case AMDGPU::BRANCH_COND_i32:
-    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
-    default:
-      llvm_unreachable("internal error");
-    }
-    return -1;
-  }
-
-  static int getBranchZeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
-    case AMDGPU::BRANCH_COND_i32:
-    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
-    default:
-      llvm_unreachable("internal error");
-    }
-    return -1;
-  }
-
-  static int getContinueNzeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
-    default:
-      llvm_unreachable("internal error");
-    };
-    return -1;
-  }
+char AMDGPUCFGStructurizer::ID = 0;
 
-  static int getContinueZeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
-    default:
-      llvm_unreachable("internal error");
-    }
-    return -1;
-  }
-
-  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
-    return instr->getOperand(0).getMBB();
-  }
-
-  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
-    instr->getOperand(0).setMBB(blk);
-  }
-
-  static MachineBasicBlock *
-  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
-    assert(blk->succ_size() == 2);
-    MachineBasicBlock *trueBranch = getTrueBranch(instr);
-    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
-    MachineBasicBlock::succ_iterator iterNext = iter;
-    ++iterNext;
-
-    return (*iter == trueBranch) ? *iterNext : *iter;
-  }
-
-  static bool isCondBranch(MachineInstr *instr) {
-    switch (instr->getOpcode()) {
-      case AMDGPU::JUMP_COND:
-      case AMDGPU::BRANCH_COND_i32:
-      case AMDGPU::BRANCH_COND_f32:
-      break;
-    default:
-      return false;
-    }
-    return true;
-  }
-
-  static bool isUncondBranch(MachineInstr *instr) {
-    switch (instr->getOpcode()) {
-    case AMDGPU::JUMP:
-    case AMDGPU::BRANCH:
-      return true;
-    default:
-      return false;
-    }
-    return true;
-  }
-
-  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
-    //get DebugLoc from the first MachineBasicBlock instruction with debug info
-    DebugLoc DL;
-    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getDebugLoc().isUnknown() == false) {
-        DL = instr->getDebugLoc();
-      }
-    }
-    return DL;
-  }
-
-  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    MachineInstr *instr = &*iter;
-    if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
-      return instr;
-    }
-    return NULL;
-  }
-
-  // The correct naming for this is getPossibleLoopendBlockBranchInstr.
-  //
-  // BB with backward-edge could have move instructions after the branch
-  // instruction.  Such move instruction "belong to" the loop backward-edge.
-  //
-  static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
-    const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>(
-                                  blk->getParent()->getTarget().getInstrInfo());
-
-    for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
-         iterEnd = blk->rend(); iter != iterEnd; ++iter) {
-      // FIXME: Simplify
-      MachineInstr *instr = &*iter;
-      if (instr) {
-        if (isCondBranch(instr) || isUncondBranch(instr)) {
-          return instr;
-        } else if (!TII->isMov(instr->getOpcode())) {
-          break;
-        }
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    if (iter != blk->rend()) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::RETURN) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    if (iter != blk->rend()) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::CONTINUE) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
-    for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static bool isReturnBlock(MachineBasicBlock *blk) {
-    MachineInstr *instr = getReturnInstr(blk);
-    bool isReturn = (blk->succ_size() == 0);
-    if (instr) {
-      assert(isReturn);
-    } else if (isReturn) {
-      DEBUG(
-        dbgs() << "BB" << blk->getNumber()
-               <<" is return block without RETURN instr\n";
-      );
-    }
-
-    return  isReturn;
-  }
-
-  static MachineBasicBlock::iterator
-  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
-    assert(instr->getParent() == blk && "instruction doesn't belong to block");
-    MachineBasicBlock::iterator iter = blk->begin();
-    MachineBasicBlock::iterator iterEnd = blk->end();
-    while (&(*iter) != instr && iter != iterEnd) {
-      ++iter;
-    }
-
-    assert(iter != iterEnd);
-    return iter;
-  }//getInstrPos
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
-                                         AMDGPUCFGStructurizer *passRep) {
-    return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
-  } //insertInstrBefore
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
-                                         AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineInstr *newInstr =
-      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    MachineBasicBlock::iterator res;
-    if (blk->begin() != blk->end()) {
-      blk->insert(blk->begin(), newInstr);
-    } else {
-      blk->push_back(newInstr);
-    }
-
-    SHOWNEWINSTR(newInstr);
-
-    return newInstr;
-  } //insertInstrBefore
-
-  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
-                             AMDGPUCFGStructurizer *passRep) {
-    insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
-  } //insertInstrEnd
-
-  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
-                             AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-   MachineInstr *newInstr = blk->getParent()
-      ->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    blk->push_back(newInstr);
-    //assume the instruction doesn't take any reg operand ...
-
-    SHOWNEWINSTR(newInstr);
-  } //insertInstrEnd
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
-                                         int newOpcode, 
-                                         AMDGPUCFGStructurizer *passRep) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineInstr *newInstr =
-      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
-                                           DebugLoc());
-
-    blk->insert(instrPos, newInstr);
-    //assume the instruction doesn't take any reg operand ...
-
-    SHOWNEWINSTR(newInstr);
-    return newInstr;
-  } //insertInstrBefore
-
-  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
-                                     int newOpcode,
-                                     AMDGPUCFGStructurizer *passRep,
-                                     DebugLoc DL) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    blk->insert(instrPos, newInstr);
-    MachineInstrBuilder MIB(*MF, newInstr);
-    MIB.addReg(oldInstr->getOperand(1).getReg(), false);
-
-    SHOWNEWINSTR(newInstr);
-    //erase later oldInstr->eraseFromParent();
-  } //insertCondBranchBefore
-
-  static void insertCondBranchBefore(MachineBasicBlock *blk,
-                                     MachineBasicBlock::iterator insertPos,
-                                     int newOpcode,
-                                     AMDGPUCFGStructurizer *passRep,
-                                     RegiT regNum,
-                                     DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineFunction *MF = blk->getParent();
-
-    MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    //insert before
-    blk->insert(insertPos, newInstr);
-    MachineInstrBuilder(*MF, newInstr).addReg(regNum, false);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertCondBranchBefore
-
-  static void insertCondBranchEnd(MachineBasicBlock *blk,
-                                  int newOpcode,
-                                  AMDGPUCFGStructurizer *passRep,
-                                  RegiT regNum) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr =
-      MF->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
-
-    blk->push_back(newInstr);
-    MachineInstrBuilder(*MF, newInstr).addReg(regNum, false);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertCondBranchEnd
-
-
-  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
-                                      AMDGPUCFGStructurizer *passRep,
-                                      RegiT regNum, int regVal) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
-                                                 regVal);
-    blk->insert(instrPos, newInstr);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertAssignInstrBefore
-
-  static void insertAssignInstrBefore(MachineBasicBlock *blk,
-                                      AMDGPUCFGStructurizer *passRep,
-                                      RegiT regNum, int regVal) {
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-
-    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
-                                                 regVal);
-    if (blk->begin() != blk->end()) {
-      blk->insert(blk->begin(), newInstr);
-    } else {
-      blk->push_back(newInstr);
-    }
-
-    SHOWNEWINSTR(newInstr);
-
-  } //insertInstrBefore
-
-  static void insertCompareInstrBefore(MachineBasicBlock *blk,
-                                       MachineBasicBlock::iterator instrPos,
-                                       AMDGPUCFGStructurizer *passRep,
-                                       RegiT dstReg, RegiT src1Reg,
-                                       RegiT src2Reg) {
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr =
-      MF->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc());
-
-    MachineInstrBuilder MIB(*MF, newInstr);
-    MIB.addReg(dstReg, RegState::Define); //set target
-    MIB.addReg(src1Reg); //set src value
-    MIB.addReg(src2Reg); //set src value
-
-    blk->insert(instrPos, newInstr);
-    SHOWNEWINSTR(newInstr);
-
-  } //insertCompareInstrBefore
-
-  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
-                                 MachineBasicBlock *srcBlk) {
-    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
-         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
-      dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of
-    }
-  } //cloneSuccessorList
-
-  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
-    MachineFunction *func = srcBlk->getParent();
-    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
-    func->push_back(newBlk);  //insert to function
-    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
-         iterEnd = srcBlk->end();
-         iter != iterEnd; ++iter) {
-      MachineInstr *instr = func->CloneMachineInstr(iter);
-      newBlk->push_back(instr);
-    }
-    return newBlk;
-  }
-
-  //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because
-  //the AMDGPU instruction is not recognized as terminator fix this and retire
-  //this routine
-  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
-                                         MachineBasicBlock *oldBlk,
-                                         MachineBasicBlock *newBlk) {
-    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
-    if (branchInstr && isCondBranch(branchInstr) &&
-        getTrueBranch(branchInstr) == oldBlk) {
-      setTrueBranch(branchInstr, newBlk);
-    }
-  }
-
-  static void wrapup(MachineBasicBlock *entryBlk) {
-    assert((!entryBlk->getParent()->getJumpTableInfo()
-            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
-           && "found a jump table");
-
-     //collect continue right before endloop
-     SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
-     MachineBasicBlock::iterator pre = entryBlk->begin();
-     MachineBasicBlock::iterator iterEnd = entryBlk->end();
-     MachineBasicBlock::iterator iter = pre;
-     while (iter != iterEnd) {
-       if (pre->getOpcode() == AMDGPU::CONTINUE
-           && iter->getOpcode() == AMDGPU::ENDLOOP) {
-         contInstr.push_back(pre);
-       }
-       pre = iter;
-       ++iter;
-     } //end while
-
-     //delete continue right before endloop
-     for (unsigned i = 0; i < contInstr.size(); ++i) {
-        contInstr[i]->eraseFromParent();
-     }
-
-     // TODO to fix up jump table so later phase won't be confused.  if
-     // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
-     // there isn't such an interface yet.  alternatively, replace all the other
-     // blocks in the jump table with the entryBlk //}
-
-  } //wrapup
-
-  static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachineDominatorTree>();
-  }
-
-  static MachinePostDominatorTree*
-  getPostDominatorTree(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachinePostDominatorTree>();
-  }
-
-  static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachineLoopInfo>();
-  }
-}; // template class CFGStructTraits
 } // end anonymous namespace
 
-// createAMDGPUCFGPreparationPass- Returns a pass
-FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm) {
-  return new AMDGPUCFGPrepare(tm);
-}
-
-bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) {
-  return CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func, *this,
-                                                       getTargetRegisterInfo());
-}
 
-// createAMDGPUCFGStructurizerPass- Returns a pass
 FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm) {
-  return new AMDGPUCFGPerform(tm);
-}
-
-bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) {
-  return CFGStructurizer<AMDGPUCFGStructurizer>().run(func, *this,
-                                                      getTargetRegisterInfo());
+  return new AMDGPUCFGStructurizer(tm);
 }


_______________________________________________
llvm-commits mailing list
llvm-commits at cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits






More information about the llvm-commits mailing list