[llvm-commits] [PATCH 3/6] AMDIL: Add Function passes for R600/SI codegen

Tom Stellard tstellar at gmail.com
Wed Apr 25 12:30:55 PDT 2012


---
 lib/Target/AMDIL/AMDGPUConvertToISA.cpp            |   65 +
 lib/Target/AMDIL/AMDGPULowerInstructions.cpp       |   82 +
 lib/Target/AMDIL/AMDGPULowerShaderInstructions.cpp |   38 +
 lib/Target/AMDIL/AMDGPULowerShaderInstructions.h   |   40 +
 .../AMDIL/AMDGPUReorderPreloadInstructions.cpp     |   66 +
 lib/Target/AMDIL/AMDILAlgorithms.tpp               |   93 +
 lib/Target/AMDIL/AMDILCFGStructurizer.cpp          | 3250 ++++++++++++++++++++
 lib/Target/AMDIL/AMDILCodeEmitter.h                |   46 +
 lib/Target/AMDIL/AMDILMCCodeEmitter.cpp            |  158 +
 lib/Target/AMDIL/AMDILMachinePeephole.cpp          |  173 ++
 lib/Target/AMDIL/AMDILPeepholeOptimizer.cpp        | 1138 +++++++
 lib/Target/AMDIL/R600CodeEmitter.cpp               |  749 +++++
 lib/Target/AMDIL/R600KernelParameters.cpp          |  503 +++
 lib/Target/AMDIL/R600KernelParameters.h            |   28 +
 lib/Target/AMDIL/R600LowerInstructions.cpp         |  502 +++
 lib/Target/AMDIL/R600LowerShaderInstructions.cpp   |  143 +
 lib/Target/AMDIL/R600OpenCLUtils.h                 |   49 +
 lib/Target/AMDIL/SIAssignInterpRegs.cpp            |  110 +
 lib/Target/AMDIL/SICodeEmitter.cpp                 |  274 ++
 lib/Target/AMDIL/SILowerShaderInstructions.cpp     |   90 +
 lib/Target/AMDIL/SIPropagateImmReads.cpp           |   70 +
 21 files changed, 7667 insertions(+), 0 deletions(-)
 create mode 100644 lib/Target/AMDIL/AMDGPUConvertToISA.cpp
 create mode 100644 lib/Target/AMDIL/AMDGPULowerInstructions.cpp
 create mode 100644 lib/Target/AMDIL/AMDGPULowerShaderInstructions.cpp
 create mode 100644 lib/Target/AMDIL/AMDGPULowerShaderInstructions.h
 create mode 100644 lib/Target/AMDIL/AMDGPUReorderPreloadInstructions.cpp
 create mode 100644 lib/Target/AMDIL/AMDILAlgorithms.tpp
 create mode 100644 lib/Target/AMDIL/AMDILCFGStructurizer.cpp
 create mode 100644 lib/Target/AMDIL/AMDILCodeEmitter.h
 create mode 100644 lib/Target/AMDIL/AMDILMCCodeEmitter.cpp
 create mode 100644 lib/Target/AMDIL/AMDILMachinePeephole.cpp
 create mode 100644 lib/Target/AMDIL/AMDILPeepholeOptimizer.cpp
 create mode 100644 lib/Target/AMDIL/R600CodeEmitter.cpp
 create mode 100644 lib/Target/AMDIL/R600KernelParameters.cpp
 create mode 100644 lib/Target/AMDIL/R600KernelParameters.h
 create mode 100644 lib/Target/AMDIL/R600LowerInstructions.cpp
 create mode 100644 lib/Target/AMDIL/R600LowerShaderInstructions.cpp
 create mode 100644 lib/Target/AMDIL/R600OpenCLUtils.h
 create mode 100644 lib/Target/AMDIL/SIAssignInterpRegs.cpp
 create mode 100644 lib/Target/AMDIL/SICodeEmitter.cpp
 create mode 100644 lib/Target/AMDIL/SILowerShaderInstructions.cpp
 create mode 100644 lib/Target/AMDIL/SIPropagateImmReads.cpp

diff --git a/lib/Target/AMDIL/AMDGPUConvertToISA.cpp b/lib/Target/AMDIL/AMDGPUConvertToISA.cpp
new file mode 100644
index 0000000..ce947f8
--- /dev/null
+++ b/lib/Target/AMDIL/AMDGPUConvertToISA.cpp
@@ -0,0 +1,65 @@
+//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers AMDIL machine instructions to the appropriate hardware
+// instructions. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+  class AMDGPUConvertToISAPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+    void lowerFLT(MachineInstr &MI);
+
+  public:
+    AMDGPUConvertToISAPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  };
+} /* End anonymous namespace */
+
+char AMDGPUConvertToISAPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
+  return new AMDGPUConvertToISAPass(tm);
+}
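+
+// The target machine is expected to schedule this pass near the end of its
+// codegen pipeline.  A call site would look roughly like the following
+// (illustrative only; the actual pipeline setup is not part of this patch):
+//
+//   PM.add(createAMDGPUConvertToISAPass(*this));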
+
+bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF)
+{
+  const AMDGPUInstrInfo * TII =
+                      static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
+      MachineInstr &MI = *I;
+      MachineInstr * newInstr = TII->convertToISA(MI, MF, MBB.findDebugLoc(I));
+      if (!newInstr) {
+        continue;
+      }
+      MBB.insert(I, newInstr);
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
diff --git a/lib/Target/AMDIL/AMDGPULowerInstructions.cpp b/lib/Target/AMDIL/AMDGPULowerInstructions.cpp
new file mode 100644
index 0000000..b49d0dd
--- /dev/null
+++ b/lib/Target/AMDIL/AMDGPULowerInstructions.cpp
@@ -0,0 +1,82 @@
+//===-- AMDGPULowerInstructions.cpp - Lower AMDGPU pseudo instructions ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands AMDGPU pseudo instructions that have no direct hardware
+// equivalent.  At the moment it lowers VCREATE_v4f32 to an IMPLICIT_DEF of
+// the full vector followed by an INSERT_SUBREG of the scalar source into the
+// x channel.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class AMDGPULowerInstructionsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+    void lowerVCREATE_v4f32(MachineInstr &MI, MachineBasicBlock::iterator I,
+                              MachineBasicBlock &MBB, MachineFunction &MF);
+
+  public:
+    AMDGPULowerInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  };
+} /* End anonymous namespace */
+
+char AMDGPULowerInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerInstructionsPass(TargetMachine &tm) {
+  return new AMDGPULowerInstructionsPass(tm);
+}
+
+bool AMDGPULowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      default: continue;
+      case AMDIL::VCREATE_v4f32: lowerVCREATE_v4f32(MI, I, MBB, MF); break;
+
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
+
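+// Illustrative sketch of the rewrite performed below (pseudo notation, not
+// an actual machine-instruction dump):
+//
+//   %dst = VCREATE_v4f32 %src
+//     ==>
+//   %tmp = IMPLICIT_DEF
+//   %dst = INSERT_SUBREG %tmp, %src, sel_x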
+void AMDGPULowerInstructionsPass::lowerVCREATE_v4f32(MachineInstr &MI,
+    MachineBasicBlock::iterator I, MachineBasicBlock &MBB, MachineFunction &MF)
+{
+  MachineRegisterInfo & MRI = MF.getRegInfo();
+  unsigned tmp = MRI.createVirtualRegister(
+                  MRI.getRegClass(MI.getOperand(0).getReg()));
+
+  BuildMI(MBB, I, DebugLoc(), TM.getInstrInfo()->get(AMDIL::IMPLICIT_DEF), tmp);
+
+  BuildMI(MBB, I, DebugLoc(), TM.getInstrInfo()->get(AMDIL::INSERT_SUBREG))
+          .addOperand(MI.getOperand(0))
+          .addReg(tmp)
+          .addOperand(MI.getOperand(1))
+          .addImm(AMDIL::sel_x);
+}
diff --git a/lib/Target/AMDIL/AMDGPULowerShaderInstructions.cpp b/lib/Target/AMDIL/AMDGPULowerShaderInstructions.cpp
new file mode 100644
index 0000000..d33055c
--- /dev/null
+++ b/lib/Target/AMDIL/AMDGPULowerShaderInstructions.cpp
@@ -0,0 +1,38 @@
+//===-- AMDGPULowerShaderInstructions.cpp - Shader lowering helpers ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper code shared by the R600 and SI shader instruction lowering passes,
+// most notably preloadRegister(), which marks a physical register as live-in
+// and copies it into a virtual register at the start of the function.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPULowerShaderInstructions.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+void AMDGPULowerShaderInstructionsPass::preloadRegister(MachineFunction * MF,
+    const TargetInstrInfo * TII, unsigned physReg, unsigned virtReg) const
+{
+  if (!MRI->isLiveIn(physReg)) {
+    MRI->addLiveIn(physReg, virtReg);
+    MachineBasicBlock &EntryMBB = MF->front();
+    BuildMI(MF->front(), EntryMBB.begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
+            virtReg)
+            .addReg(physReg);
+  } else {
+    /* We can't mark the same register as preloaded twice, but we still must
+     * associate virtReg with the correct preloaded register. */
+    unsigned newReg = MRI->getLiveInVirtReg(physReg);
+    MRI->replaceRegWith(virtReg, newReg);
+  }
+}
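+
+/* A minimal usage sketch for preloadRegister().  The register and register
+ * class names here are hypothetical; a derived lowering pass would do
+ * something along these lines from its runOnMachineFunction:
+ *
+ *   MRI = &MF.getRegInfo();
+ *   unsigned VReg = MRI->createVirtualRegister(&AMDIL::GPRF32RegClass);
+ *   preloadRegister(&MF, TM.getInstrInfo(), AMDIL::T0_X, VReg);
+ *   // VReg now holds the preloaded value from function entry onwards.
+ */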
diff --git a/lib/Target/AMDIL/AMDGPULowerShaderInstructions.h b/lib/Target/AMDIL/AMDGPULowerShaderInstructions.h
new file mode 100644
index 0000000..5ee77fa
--- /dev/null
+++ b/lib/Target/AMDIL/AMDGPULowerShaderInstructions.h
@@ -0,0 +1,40 @@
+//===-- AMDGPULowerShaderInstructions.h - Shader lowering helpers --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares AMDGPULowerShaderInstructionsPass, the common base class for the
+// R600 and SI shader instruction lowering passes.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef AMDGPU_LOWER_SHADER_INSTRUCTIONS
+#define AMDGPU_LOWER_SHADER_INSTRUCTIONS
+
+namespace llvm {
+
+class MachineFunction;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+
+class AMDGPULowerShaderInstructionsPass {
+
+  protected:
+    MachineRegisterInfo * MRI;
+    /**
+     * @param physReg The physical register that will be preloaded.
+     * @param virtReg The virtual register that currently holds the
+     *                preloaded value.
+     */
+    void preloadRegister(MachineFunction * MF, const TargetInstrInfo * TII,
+                         unsigned physReg, unsigned virtReg) const;
+};
+
+} // end namespace llvm
+
+
+#endif // AMDGPU_LOWER_SHADER_INSTRUCTIONS
diff --git a/lib/Target/AMDIL/AMDGPUReorderPreloadInstructions.cpp b/lib/Target/AMDIL/AMDGPUReorderPreloadInstructions.cpp
new file mode 100644
index 0000000..c923f19
--- /dev/null
+++ b/lib/Target/AMDIL/AMDGPUReorderPreloadInstructions.cpp
@@ -0,0 +1,66 @@
+//===-- AMDGPUReorderPreloadInstructions.cpp - Reorder preload instrs ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions that represent preloaded registers to the
+// beginning of the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Function.h"
+
+using namespace llvm;
+
+namespace {
+  class AMDGPUReorderPreloadInstructionsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    AMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const {
+      return "AMDGPU Reorder Preload Instructions";
+    }
+  };
+} /* End anonymous namespace */
+
+char AMDGPUReorderPreloadInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) {
+    return new AMDGPUReorderPreloadInstructionsPass(tm);
+}
+
+/* This pass moves instructions that represent preloaded registers to the
+ * start of the program. */
+bool AMDGPUReorderPreloadInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  const AMDGPUInstrInfo * TII =
+                        static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
+      MachineInstr &MI = *I;
+      if (TII->isRegPreload(MI)) {
+         MF.front().insert(MF.front().begin(), MI.removeFromParent());
+      }
+    }
+  }
+  return false;
+}
diff --git a/lib/Target/AMDIL/AMDILAlgorithms.tpp b/lib/Target/AMDIL/AMDILAlgorithms.tpp
new file mode 100644
index 0000000..058475f
--- /dev/null
+++ b/lib/Target/AMDIL/AMDILAlgorithms.tpp
@@ -0,0 +1,93 @@
+//===------ AMDILAlgorithms.tpp - AMDIL Template Algorithms Header --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides template algorithms that extend the STL algorithms and
+// are useful for the AMDIL backend.
+//
+//===----------------------------------------------------------------------===//
+
+// A template function that loops through the iterators and passes the second
+// argument along with each iterator to the function.  This is based on the
+// STL for_each function, but additionally forwards a reference to the second
+// argument.  In the 'safe' variant below, a return value of true from the
+// function means the current iterator was invalidated, so the loop steps back
+// one position before advancing to the next iterator.
+template<class InputIterator, class Function, typename Arg>
+Function binaryForEach(InputIterator First, InputIterator Last, Function F,
+                       Arg &Second)
+{
+  for ( ; First!=Last; ++First ) {
+    F(*First, Second);
+  }
+  return F;
+}
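+
+// For example, binaryForEach(MBB.begin(), MBB.end(), CountOpcodes(), Stats)
+// would call CountOpcodes(MI, Stats) for every instruction in MBB (the
+// functor and argument names are illustrative).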
+
+template<class InputIterator, class Function, typename Arg>
+Function safeBinaryForEach(InputIterator First, InputIterator Last, Function F,
+                           Arg &Second)
+{
+  for ( ; First!=Last; ++First ) {
+    if (F(*First, Second)) {
+      --First;
+    }
+  }
+  return F;
+}
+
+// A template function that has two levels of looping before calling the
+// function with the passed-in argument.  See binaryForEach for further
+// explanation.
+template<class InputIterator, class Function, typename Arg>
+Function binaryNestedForEach(InputIterator First, InputIterator Last,
+                             Function F, Arg &Second)
+{
+  for ( ; First != Last; ++First) {
+    binaryForEach(First->begin(), First->end(), F, Second);
+  }
+  return F;
+}
+template<class InputIterator, class Function, typename Arg>
+Function safeBinaryNestedForEach(InputIterator First, InputIterator Last,
+                                 Function F, Arg &Second)
+{
+  for ( ; First != Last; ++First) {
+    safeBinaryForEach(First->begin(), First->end(), F, Second);
+  }
+  return F;
+}
+
+// Unlike the STL, the 'safe' versions of these functions pass a pointer to
+// the iterator itself into the function.  This allows the function to handle
+// situations such as invalidated iterators.
+template<class InputIterator, class Function>
+Function safeForEach(InputIterator First, InputIterator Last, Function F)
+{
+  for ( ; First != Last; ++First) {
+    F(&First);
+  }
+  return F;
+}
+
+// A template function that has two levels of looping before calling the
+// function with a pointer to the current iterator. See binaryForEach for
+// further explanation
+template<class InputIterator, class SecondIterator, class Function>
+Function safeNestedForEach(InputIterator First, InputIterator Last,
+                              SecondIterator S, Function F)
+{
+  for ( ; First != Last; ++First) {
+    SecondIterator sf, sl;
+    for (sf = First->begin(), sl = First->end();
+         sf != sl; )  {
+      if (!F(&sf)) {
+        ++sf;
+      } 
+    }
+  }
+  return F;
+}
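+
+// A minimal usage sketch for safeNestedForEach (the functor and predicate
+// names are made up for illustration): visit every instruction in a
+// MachineFunction while letting the functor erase the instruction it is
+// given.  Returning true tells the loop the iterator was already advanced.
+//
+//   struct EraseDeadInstr {
+//     bool operator()(MachineBasicBlock::iterator *I) {
+//       MachineInstr &MI = **I;
+//       if (isDead(MI)) {                    // hypothetical predicate
+//         *I = MI.getParent()->erase(*I);    // erase() returns the next
+//         return true;                       // iterator, so don't ++ again
+//       }
+//       return false;
+//     }
+//   };
+//   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
+//                     EraseDeadInstr());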
diff --git a/lib/Target/AMDIL/AMDILCFGStructurizer.cpp b/lib/Target/AMDIL/AMDILCFGStructurizer.cpp
new file mode 100644
index 0000000..289af6f
--- /dev/null
+++ b/lib/Target/AMDIL/AMDILCFGStructurizer.cpp
@@ -0,0 +1,3250 @@
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "structcfg"
+#ifdef DEBUG
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME 0
+#endif
+
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define FirstNonDebugInstr(A) A->begin()
+using namespace llvm;
+
+// bixia TODO: move this out to analysis lib. Make this work for both target
+// AMDIL and CBackend.
+// TODO: move-begin.
+
+//===----------------------------------------------------------------------===//
+//
+// Statistics for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+
+STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
+    "matched");
+STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
+    "matched");
+STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
+    "pattern matched");
+STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
+    "pattern matched");
+STATISTIC(numLoopPatternMatch,      "CFGStructurizer number of loop pattern "
+    "matched");
+STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
+STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
+
+//===----------------------------------------------------------------------===//
+//
+// Miscellaneous utility for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+namespace llvmCFGStruct
+{
+#define SHOWNEWINSTR(i) \
+  if (DEBUGME) errs() << "New instr: " << *i << "\n"
+
+#define SHOWNEWBLK(b, msg) \
+if (DEBUGME) { \
+  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  errs() << "\n"; \
+}
+
+#define SHOWBLK_DETAIL(b, msg) \
+if (DEBUGME) { \
+  if (b) { \
+  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  b->print(errs()); \
+  errs() << "\n"; \
+  } \
+}
+
+#define INVALIDSCCNUM -1
+#define INVALIDREGNUM 0
+
+template<class LoopinfoT>
+void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
+  for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
+       iterEnd = LoopInfo.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->print(OS, 0);
+  }
+}
+
+template<class NodeT>
+void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
+  size_t sz = Src.size();
+  for (size_t i = 0; i < sz/2; ++i) {
+    NodeT *t = Src[i];
+    Src[i] = Src[sz - i - 1];
+    Src[sz - i - 1] = t;
+  }
+}
+
+} //end namespace llvmCFGStruct
+
+
+//===----------------------------------------------------------------------===//
+//
+// MachinePostDominatorTree
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+/// MachinePostDominatorTree - a MachineFunctionPass that wraps a
+/// DominatorTreeBase configured to compute a post-dominator tree over
+/// machine basic blocks.
+///
+struct MachinePostDominatorTree : public MachineFunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  DominatorTreeBase<MachineBasicBlock> *DT;
+  MachinePostDominatorTree() : MachineFunctionPass(ID)
+  {
+    DT = new DominatorTreeBase<MachineBasicBlock>(true); // 'true' selects a
+                                                         // post-dominator tree
+  }
+
+  ~MachinePostDominatorTree();
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  inline const std::vector<MachineBasicBlock *> &getRoots() const {
+    return DT->getRoots();
+  }
+
+  inline MachineDomTreeNode *getRootNode() const {
+    return DT->getRootNode();
+  }
+
+  inline MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
+    return DT->getNode(BB);
+  }
+
+  inline MachineDomTreeNode *getNode(MachineBasicBlock *BB) const {
+    return DT->getNode(BB);
+  }
+
+  inline bool dominates(MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+    return DT->dominates(A, B);
+  }
+
+  inline bool dominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+    return DT->dominates(A, B);
+  }
+
+  inline bool
+  properlyDominates(const MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+    return DT->properlyDominates(A, B);
+  }
+
+  inline bool
+  properlyDominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+    return DT->properlyDominates(A, B);
+  }
+
+  inline MachineBasicBlock *
+  findNearestCommonDominator(MachineBasicBlock *A, MachineBasicBlock *B) {
+    return DT->findNearestCommonDominator(A, B);
+  }
+
+  virtual void print(llvm::raw_ostream &OS, const Module *M = 0) const {
+    DT->print(OS);
+  }
+};
+} //end of namespace llvm
+
+char MachinePostDominatorTree::ID = 0;
+static RegisterPass<MachinePostDominatorTree>
+machinePostDominatorTreePass("machinepostdomtree",
+                             "MachinePostDominator Tree Construction",
+                             true, true);
+
+//const PassInfo *const llvm::MachinePostDominatorsID
+//= &machinePostDominatorTreePass;
+
+bool MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  DT->recalculate(F);
+  //DEBUG(DT->dump());
+  return false;
+}
+
+MachinePostDominatorTree::~MachinePostDominatorTree() {
+  delete DT;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// supporting data structure for CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+template<class PassT>
+struct CFGStructTraits {
+};
+
+template <class InstrT>
+class BlockInformation {
+public:
+  bool isRetired;
+  int  sccNum;
+  //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
+  //Instructions defining the corresponding successor.
+  BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
+};
+
+template <class BlockT, class InstrT, class RegiT>
+class LandInformation {
+public:
+  BlockT *landBlk;
+  std::set<RegiT> breakInitRegs;  //Registers that need a "reg = 0" before
+                                  //WHILELOOP(thisloop), i.e. initialized
+                                  //before entering thisloop.
+  std::set<RegiT> contInitRegs;   //Registers that need a "reg = 0" after
+                                  //WHILELOOP(thisloop), i.e. initialized
+                                  //after entering thisloop.
+  std::set<RegiT> endbranchInitRegs; //Registers initialized before entering
+                                     //this loop; the loop land block branches
+                                     //on this register.
+  std::set<RegiT> breakOnRegs;       //Registers that need an "if (reg) break
+                                     //endif" after ENDLOOP(thisloop) to break
+                                     //out of outerLoopOf(thisLoop).
+  std::set<RegiT> contOnRegs;        //Registers that need an "if (reg)
+                                     //continue endif" after ENDLOOP(thisloop)
+                                     //to continue in outerLoopOf(thisLoop).
+  LandInformation() : landBlk(NULL) {}
+};
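+
+// Taken together, these sets describe roughly how a break out of 'thisloop'
+// into outerLoopOf(thisLoop) is materialized (sketch, not actual emitted
+// code):
+//
+//   reg = 0                  // breakInitRegs: before WHILELOOP(thisloop)
+//   WHILELOOP(thisloop)
+//     ...
+//     reg = 1 ; break        // in the breaking block
+//     ...
+//   ENDLOOP(thisloop)
+//   if (reg) break           // breakOnRegs: propagates the break to
+//                            // outerLoopOf(thisLoop)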
+
+} //end of namespace llvmCFGStruct
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
+template<class PassT>
+class  CFGStructurizer
+{
+public:
+  typedef enum {
+    Not_SinglePath = 0,
+    SinglePath_InPath = 1,
+    SinglePath_NotInPath = 2
+  } PathToKind;
+
+public:
+  typedef typename PassT::InstructionType         InstrT;
+  typedef typename PassT::FunctionType            FuncT;
+  typedef typename PassT::DominatortreeType       DomTreeT;
+  typedef typename PassT::PostDominatortreeType   PostDomTreeT;
+  typedef typename PassT::DomTreeNodeType         DomTreeNodeT;
+  typedef typename PassT::LoopinfoType            LoopInfoT;
+
+  typedef GraphTraits<FuncT *>                    FuncGTraits;
+  //typedef FuncGTraits::nodes_iterator BlockIterator;
+  typedef typename FuncT::iterator                BlockIterator;
+
+  typedef typename FuncGTraits::NodeType          BlockT;
+  typedef GraphTraits<BlockT *>                   BlockGTraits;
+  typedef GraphTraits<Inverse<BlockT *> >         InvBlockGTraits;
+  //typedef BlockGTraits::succ_iterator InstructionIterator;
+  typedef typename BlockT::iterator               InstrIterator;
+
+  typedef CFGStructTraits<PassT>                  CFGTraits;
+  typedef BlockInformation<InstrT>                BlockInfo;
+  typedef std::map<BlockT *, BlockInfo *>         BlockInfoMap;
+
+  typedef int                                     RegiT;
+  typedef typename PassT::LoopType                LoopT;
+  typedef LandInformation<BlockT, InstrT, RegiT>  LoopLandInfo;
+  typedef std::map<LoopT *, LoopLandInfo *>      LoopLandInfoMap;
+                                                  //landing info for loop break
+  typedef SmallVector<BlockT *, 32>               BlockTSmallerVector;
+
+public:
+  CFGStructurizer();
+  ~CFGStructurizer();
+
+  /// Perform the CFG structurization
+  bool run(FuncT &Func, PassT &Pass);
+
+  /// Perform the CFG preparation
+  bool prepare(FuncT &Func, PassT &Pass);
+
+private:
+  void   orderBlocks();
+  void   printOrderedBlocks(llvm::raw_ostream &OS);
+  int patternMatch(BlockT *CurBlock);
+  int patternMatchGroup(BlockT *CurBlock);
+
+  int serialPatternMatch(BlockT *CurBlock);
+  int ifPatternMatch(BlockT *CurBlock);
+  int switchPatternMatch(BlockT *CurBlock);
+  int loopendPatternMatch(BlockT *CurBlock);
+  int loopPatternMatch(BlockT *CurBlock);
+
+  int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
+  int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
+  //int loopWithoutBreak(BlockT *);
+
+  void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
+                        BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
+  void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
+                           BlockT *ContBlock, LoopT *contLoop);
+  bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
+  int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                       BlockT *FalseBlock);
+  int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
+                          BlockT *FalseBlock);
+  int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                              BlockT *FalseBlock, BlockT **LandBlockPtr);
+  void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                                   BlockT *FalseBlock, BlockT *LandBlock,
+                                   bool Detail = false);
+  PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
+                          bool AllowSideEntry = true);
+  BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
+                        bool AllowSideEntry = true);
+  int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
+  void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
+
+  void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
+                            BlockT *TrueBlock, BlockT *FalseBlock,
+                            BlockT *LandBlock);
+  void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
+  void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
+                           BlockT *ExitLandBlock, RegiT SetReg);
+  void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
+                           RegiT SetReg);
+  BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
+                                std::set<BlockT*> &ExitBlockSet,
+                                BlockT *ExitLandBlk);
+  BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
+                                BlockTSmallerVector &ExitingBlocks,
+                                BlockTSmallerVector &ExitBlocks);
+  BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
+  void removeUnconditionalBranch(BlockT *SrcBlock);
+  void removeRedundantConditionalBranch(BlockT *SrcBlock);
+  void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
+
+  void removeSuccessor(BlockT *SrcBlock);
+  BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
+  BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
+
+  void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
+                          InstrIterator InsertPos);
+
+  void recordSccnum(BlockT *SrcBlock, int SCCNum);
+  int getSCCNum(BlockT *srcBlk);
+
+  void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
+  bool isRetiredBlock(BlockT *SrcBlock);
+  bool isActiveLoophead(BlockT *CurBlock);
+  bool needMigrateBlock(BlockT *Block);
+
+  BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
+                              BlockTSmallerVector &exitBlocks,
+                              std::set<BlockT*> &ExitBlockSet);
+  void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
+  BlockT *getLoopLandBlock(LoopT *LoopRep);
+  LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
+
+  void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
+
+  bool hasBackEdge(BlockT *curBlock);
+  unsigned getLoopDepth  (LoopT *LoopRep);
+  int countActiveBlock(
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
+    BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
+  BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
+
+private:
+  DomTreeT *domTree;
+  PostDomTreeT *postDomTree;
+  LoopInfoT *loopInfo;
+  PassT *passRep;
+  FuncT *funcRep;
+
+  BlockInfoMap blockInfoMap;
+  LoopLandInfoMap loopLandInfoMap;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
+
+};  //template class CFGStructurizer
+
+template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
+  : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
+}
+
+template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
+  for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
+       E = blockInfoMap.end(); I != E; ++I) {
+    delete I->second;
+  }
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass) {
+  passRep = &pass;
+  funcRep = &func;
+
+  bool changed = false;
+  //func.RenumberBlocks();
+
+  //to do, if not reducible flow graph, make it so ???
+
+  if (DEBUGME) {
+        errs() << "AMDILCFGStructurizer::prepare\n";
+    //func.viewCFG();
+    //func.viewCFGOnly();
+    //func.dump();
+  }
+
+  //FIXME: gcc complains on this.
+  //domTree = &pass.getAnalysis<DomTreeT>();
+      //domTree = CFGTraits::getDominatorTree(pass);
+      //if (DEBUGME) {
+      //    domTree->print(errs());
+    //}
+
+  //FIXME: gcc complains on this.
+  //domTree = &pass.getAnalysis<DomTreeT>();
+      //postDomTree = CFGTraits::getPostDominatorTree(pass);
+      //if (DEBUGME) {
+      //   postDomTree->print(errs());
+    //}
+
+  //FIXME: gcc complains on this.
+  //loopInfo = &pass.getAnalysis<LoopInfoT>();
+  loopInfo = CFGTraits::getLoopInfo(pass);
+  if (DEBUGME) {
+    errs() << "LoopInfo:\n";
+    PrintLoopinfo(*loopInfo, errs());
+  }
+
+  orderBlocks();
+  if (DEBUGME) {
+    errs() << "Ordered blocks:\n";
+    printOrderedBlocks(errs());
+  }
+
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
+
+  for (typename LoopInfoT::iterator iter = loopInfo->begin(),
+       iterEnd = loopInfo->end();
+       iter != iterEnd; ++iter) {
+    LoopT* loopRep = (*iter);
+    BlockTSmallerVector exitingBlks;
+    loopRep->getExitingBlocks(exitingBlks);
+    
+    if (exitingBlks.size() == 0) {
+      BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
+      if (dummyExitBlk != NULL)
+        retBlks.push_back(dummyExitBlk);
+    }
+  }
+
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+       iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
+       iterBlk != iterEndBlk;
+       ++iterBlk) {
+    BlockT *curBlk = *iterBlk;
+    removeUnconditionalBranch(curBlk);
+    removeRedundantConditionalBranch(curBlk);
+    if (CFGTraits::isReturnBlock(curBlk)) {
+      retBlks.push_back(curBlk);
+    }
+    assert(curBlk->succ_size() <= 2);
+    //assert(curBlk->size() > 0);
+    //removeEmptyBlock(curBlk) ??
+  } //for
+
+  if (retBlks.size() >= 2) {
+    addDummyExitBlock(retBlks);
+    changed = true;
+  }
+
+  return changed;
+} //CFGStructurizer::prepare
+
+template<class PassT>
+bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass) {
+  passRep = &pass;
+  funcRep = &func;
+
+  //func.RenumberBlocks();
+
+  //Assume reducible CFG...
+  if (DEBUGME) {
+    errs() << "AMDILCFGStructurizer::run\n";
+    //errs() << func.getFunction()->getNameStr() << "\n";
+    func.viewCFG();
+    //func.viewCFGOnly();
+    //func.dump();
+  }
+
+#if 1
+  //FIXME: gcc complains on this.
+  //domTree = &pass.getAnalysis<DomTreeT>();
+  domTree = CFGTraits::getDominatorTree(pass);
+  if (DEBUGME) {
+    domTree->print(errs(), (const llvm::Module*)0);
+  }
+#endif
+
+  //FIXME: gcc complains on this.
+  //domTree = &pass.getAnalysis<DomTreeT>();
+  postDomTree = CFGTraits::getPostDominatorTree(pass);
+  if (DEBUGME) {
+    postDomTree->print(errs());
+  }
+
+  //FIXME: gcc complains on this.
+  //loopInfo = &pass.getAnalysis<LoopInfoT>();
+  loopInfo = CFGTraits::getLoopInfo(pass);
+  if (DEBUGME) {
+    errs() << "LoopInfo:\n";
+    PrintLoopinfo(*loopInfo, errs());
+  }
+
+  orderBlocks();
+//#define STRESSTEST
+#ifdef STRESSTEST
+  //Use the worse block ordering to test the algorithm.
+  ReverseVector(orderedBlks);
+#endif
+
+  if (DEBUGME) {
+    errs() << "Ordered blocks:\n";
+    printOrderedBlocks(errs());
+  }
+  int numIter = 0;
+  bool finish = false;
+  BlockT *curBlk;
+  bool makeProgress = false;
+  int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
+                                        orderedBlks.end());
+
+  do {
+    ++numIter;
+    if (DEBUGME) {
+      errs() << "numIter = " << numIter
+             << ", numRemaintedBlk = " << numRemainedBlk << "\n";
+    }
+
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin();
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlkEnd = orderedBlks.end();
+
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      sccBeginIter = iterBlk;
+    BlockT *sccBeginBlk = NULL;
+    int sccNumBlk = 0;  // Number of active blocks in the current SCC;
+                        // (re)set to the maximum possible number when a
+                        // new SCC is entered.
+    int sccNumIter;     // Number of iterations over this SCC.
+
+    while (iterBlk != iterBlkEnd) {
+      curBlk = *iterBlk;
+
+      if (sccBeginBlk == NULL) {
+        sccBeginIter = iterBlk;
+        sccBeginBlk = curBlk;
+        sccNumIter = 0;
+        sccNumBlk = numRemainedBlk; // Init to maximum possible number.
+        if (DEBUGME) {
+              errs() << "start processing SCC" << getSCCNum(sccBeginBlk);
+              errs() << "\n";
+        }
+      }
+
+      if (!isRetiredBlock(curBlk)) {
+        patternMatch(curBlk);
+      }
+
+      ++iterBlk;
+
+      bool contNextScc = true;
+      if (iterBlk == iterBlkEnd
+          || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
+        // Just finish one scc.
+        ++sccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
+          if (DEBUGME) {
+            errs() << "Can't reduce SCC " << getSCCNum(curBlk)
+                   << ", sccNumIter = " << sccNumIter;
+            errs() << "doesn't make any progress\n";
+          }
+          contNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
+          sccNumBlk = sccRemainedNumBlk;
+          iterBlk = sccBeginIter;
+          contNextScc = false;
+          if (DEBUGME) {
+            errs() << "repeat processing SCC" << getSCCNum(curBlk)
+                   << "sccNumIter = " << sccNumIter << "\n";
+            func.viewCFG();
+            //func.viewCFGOnly();
+          }
+        } else {
+          // Finish the current scc.
+          contNextScc = true;
+        }
+      } else {
+        // Continue on next component in the current scc.
+        contNextScc = false;
+      }
+
+      if (contNextScc) {
+        sccBeginBlk = NULL;
+      }
+    } //while, "one iteration" over the function.
+
+    BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
+    if (entryBlk->succ_size() == 0) {
+      finish = true;
+      if (DEBUGME) {
+        errs() << "Reduce to one block\n";
+      }
+    } else {
+      int newnumRemainedBlk
+        = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
+      // consider cloned blocks ??
+      if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
+        makeProgress = true;
+        numRemainedBlk = newnumRemainedBlk;
+      } else {
+        makeProgress = false;
+        if (DEBUGME) {
+          errs() << "No progress\n";
+        }
+      }
+    }
+  } while (!finish && makeProgress);
+
+  // Misc wrap up to maintain the consistency of the Function representation.
+  CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
+
+  // Detach retired Block, release memory.
+  for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
+       iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    if ((*iterMap).second && (*iterMap).second->isRetired) {
+      assert(((*iterMap).first)->getNumber() != -1);
+      if (DEBUGME) {
+        errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
+      }
+      (*iterMap).first->eraseFromParent();  //Remove from the parent Function.
+    }
+    delete (*iterMap).second;
+  }
+  blockInfoMap.clear();
+
+  // clear loopLandInfoMap
+  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
+       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    delete (*iterMap).second;
+  }
+  loopLandInfoMap.clear();
+
+  if (DEBUGME) {
+    func.viewCFG();
+    //func.dump();
+  }
+
+  if (!finish) {
+    assert(!"IRREDUCIBL_CF");
+  }
+
+  return true;
+} //CFGStructurizer::run
+
+/// Print the ordered Blocks.
+///
+template<class PassT>
+void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
+  size_t i = 0;
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
+       iterBlk != iterBlkEnd;
+       ++iterBlk, ++i) {
+    os << "BB" << (*iterBlk)->getNumber();
+    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+    if (i != 0 && i % 10 == 0) {
+      os << "\n";
+    } else {
+      os << " ";
+    }
+  }
+} //printOrderedBlocks
+
+/// Compute the reversed DFS post order of Blocks
+///
+template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
+  int sccNum = 0;
+  BlockT *bb;
+  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
+       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
+    std::vector<BlockT *> &sccNext = *sccIter;
+    for (typename std::vector<BlockT *>::const_iterator
+         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+         blockIter != blockEnd; ++blockIter) {
+      bb = *blockIter;
+      orderedBlks.push_back(bb);
+      recordSccnum(bb, sccNum);
+    }
+  }
+
+  //walk through all the block in func to check for unreachable
+  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
+       blockEnd1 = FuncGTraits::nodes_end(funcRep);
+       blockIter1 != blockEnd1; ++blockIter1) {
+    BlockT *bb = &(*blockIter1);
+    sccNum = getSCCNum(bb);
+    if (sccNum == INVALIDSCCNUM) {
+      errs() << "unreachable block BB" << bb->getNumber() << "\n";
+    }
+  } //end of for
+} //orderBlocks
+
+template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
+  int numMatch = 0;
+  int curMatch;
+
+  if (DEBUGME) {
+        errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
+  }
+
+  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
+    numMatch += curMatch;
+  }
+
+  if (DEBUGME) {
+        errs() << "End patternMatch BB" << curBlk->getNumber()
+      << ", numMatch = " << numMatch << "\n";
+  }
+
+  return numMatch;
+} //patternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
+  int numMatch = 0;
+  numMatch += serialPatternMatch(curBlk);
+  numMatch += ifPatternMatch(curBlk);
+  //numMatch += switchPatternMatch(curBlk);
+  numMatch += loopendPatternMatch(curBlk);
+  numMatch += loopPatternMatch(curBlk);
+  return numMatch;
+}//patternMatchGroup
+
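+// Serial pattern: curBlk has a single successor childBlk and childBlk has a
+// single predecessor, so the two can be merged into one block (sketch):
+//
+//   curBlk                curBlk'
+//     |          ==>      (childBlk's instructions appended,
+//   childBlk               childBlk's successors taken over)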
+template<class PassT>
+int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 1) {
+    return 0;
+  }
+
+  BlockT *childBlk = *curBlk->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+    return 0;
+  }
+
+  mergeSerialBlock(curBlk, childBlk);
+  ++numSerialPatternMatch;
+  return 1;
+} //serialPatternMatch
+
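+// If pattern: curBlk ends in a conditional branch whose true and false
+// successors (possibly after cloning) rejoin at a common land block; the
+// diamond is then collapsed by mergeIfthenelseBlock.  Sketch of the shape
+// being matched:
+//
+//         curBlk
+//         /    \
+//    trueBlk  falseBlk
+//         \    /
+//        landBlk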
+template<class PassT>
+int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+  //two edges
+  if (curBlk->succ_size() != 2) {
+    return 0;
+  }
+
+  if (hasBackEdge(curBlk)) {
+    return 0;
+  }
+
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
+  if (branchInstr == NULL) {
+    return 0;
+  }
+
+  assert(CFGTraits::isCondBranch(branchInstr));
+
+  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
+  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
+  BlockT *landBlk;
+  int cloned = 0;
+
+  // TODO: Simplify
+  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
+    && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
+    landBlk = *trueBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
+    landBlk = NULL;
+  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
+    landBlk = falseBlk;
+    falseBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && *falseBlk->succ_begin() == trueBlk) {
+    landBlk = trueBlk;
+    trueBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
+    landBlk = *falseBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 1
+    && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
+    landBlk = *trueBlk->succ_begin();
+  } else {
+    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+  }
+
+  // improveSimpleJumpintoIf can handle the case where landBlk == NULL, but
+  // the new BB created for landBlk == NULL may introduce new challenges to
+  // the reduction process.
+  if (landBlk != NULL &&
+      ((trueBlk && trueBlk->pred_size() > 1)
+      || (falseBlk && falseBlk->pred_size() > 1))) {
+     cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  }
+
+  if (trueBlk && trueBlk->pred_size() > 1) {
+    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
+    ++cloned;
+  }
+
+  if (falseBlk && falseBlk->pred_size() > 1) {
+    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
+    ++cloned;
+  }
+
+  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+
+  ++numIfPatternMatch;
+
+  numClonedBlock += cloned;
+
+  return 1 + cloned;
+} //ifPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
+  return 0;
+} //switchPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  typename std::vector<LoopT *> nestedLoops;
+  while (loopRep) {
+    nestedLoops.push_back(loopRep);
+    loopRep = loopRep->getParentLoop();
+  }
+
+  if (nestedLoops.size() == 0) {
+    return 0;
+  }
+
+  // Process nested loops from outside to inside, so a "continue" to an outer
+  // loop won't be mistaken for a "break" out of the current loop.
+  int num = 0;
+  for (typename std::vector<LoopT *>::reverse_iterator
+       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
+       iter != iterEnd; ++iter) {
+    loopRep = *iter;
+
+    if (getLoopLandBlock(loopRep) != NULL) {
+      continue;
+    }
+
+    BlockT *loopHeader = loopRep->getHeader();
+
+    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
+
+    if (numBreak == -1) {
+      break;
+    }
+
+    int numCont = loopcontPatternMatch(loopRep, loopHeader);
+    num += numBreak + numCont;
+  }
+
+  return num;
+} //loopendPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 0) {
+    return 0;
+  }
+
+  int numLoop = 0;
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+    if (loopLand) {
+      BlockT *landBlk = loopLand->landBlk;
+      assert(landBlk);
+      if (!isRetiredBlock(landBlk)) {
+        mergeLooplandBlock(curBlk, loopLand);
+        ++numLoop;
+      }
+    }
+    loopRep = loopRep->getParentLoop();
+  }
+
+  numLoopPatternMatch += numLoop;
+
+  return numLoop;
+} //loopPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
+                                                  BlockT *loopHeader) {
+  BlockTSmallerVector exitingBlks;
+  loopRep->getExitingBlocks(exitingBlks);
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
+  }
+
+  if (exitingBlks.size() == 0) {
+    setLoopLandBlock(loopRep);
+    return 0;
+  }
+
+  // Compute the corresponding exitBlks and exit block set.
+  BlockTSmallerVector exitBlks;
+  std::set<BlockT *> exitBlkSet;
+  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
+       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *exitingBlk = *iter;
+    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+    exitBlks.push_back(exitBlk);
+    exitBlkSet.insert(exitBlk);  //non-duplicate insert
+  }
+
+  assert(exitBlkSet.size() > 0);
+  assert(exitBlks.size() == exitingBlks.size());
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
+  }
+
+  // Find exitLandBlk.
+  BlockT *exitLandBlk = NULL;
+  int numCloned = 0;
+  int numSerial = 0;
+
+  if (exitBlkSet.size() == 1)
+  {
+    exitLandBlk = *exitBlkSet.begin();
+  } else {
+    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
+
+    if (exitLandBlk == NULL) {
+      return -1;
+    }
+
+    bool allInPath = true;
+    bool allNotInPath = true;
+    for (typename std::set<BlockT*>::const_iterator
+         iter = exitBlkSet.begin(),
+         iterEnd = exitBlkSet.end();
+         iter != iterEnd; ++iter) {
+      BlockT *exitBlk = *iter;
+
+      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
+      if (DEBUGME) {
+        errs() << "BB" << exitBlk->getNumber()
+               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
+               << pathKind << "\n";
+      }
+
+      allInPath = allInPath && (pathKind == SinglePath_InPath);
+      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
+
+      if (!allInPath && !allNotInPath) {
+        if (DEBUGME) {
+              errs() << "singlePath check fail\n";
+        }
+        return -1;
+      }
+    } // check all exit blocks
+
+    if (allNotInPath) {
+#if 1
+
+      // TODO: Simplify, maybe separate function?
+      //funcRep->viewCFG();
+      LoopT *parentLoopRep = loopRep->getParentLoop();
+      BlockT *parentLoopHeader = NULL;
+      if (parentLoopRep)
+        parentLoopHeader = parentLoopRep->getHeader();
+
+      if (exitLandBlk == parentLoopHeader &&
+          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
+                                               loopRep,
+                                               exitBlkSet,
+                                               exitLandBlk)) != NULL) {
+        if (DEBUGME) {
+          errs() << "relocateLoopcontBlock success\n";
+        }
+      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
+                                                      exitingBlks,
+                                                      exitBlks)) != NULL) {
+        if (DEBUGME) {
+          errs() << "insertEndbranchBlock success\n";
+        }
+      } else {
+        if (DEBUGME) {
+          errs() << "loop exit fail\n";
+        }
+        return -1;
+      }
+#else
+      return -1;
+#endif
+    }
+
+    // Handle side entry to exit path.
+    exitBlks.clear();
+    exitBlkSet.clear();
+    for (typename BlockTSmallerVector::iterator iterExiting =
+           exitingBlks.begin(),
+         iterExitingEnd = exitingBlks.end();
+         iterExiting != iterExitingEnd; ++iterExiting) {
+      BlockT *exitingBlk = *iterExiting;
+      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+      BlockT *newExitBlk = exitBlk;
+
+      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
+        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
+        ++numCloned;
+      }
+
+      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
+
+      exitBlks.push_back(newExitBlk);
+      exitBlkSet.insert(newExitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      numSerial += serialPatternMatch(exitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      if (exitBlk->pred_size() > 1) {
+        if (exitBlk != exitLandBlk) {
+          return -1;
+        }
+      } else {
+        if (exitBlk != exitLandBlk &&
+            (exitBlk->succ_size() != 1 ||
+            *exitBlk->succ_begin() != exitLandBlk)) {
+          return -1;
+        }
+      }
+    }
+  } // else
+
+  // LoopT *exitLandLoop = loopInfo->getLoopFor(exitLandBlk);
+  exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
+
+  // Fold break into the breaking block. Leverage across level breaks.
+  assert(exitingBlks.size() == exitBlks.size());
+  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
+       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
+       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
+    BlockT *exitBlk = *iterExit;
+    BlockT *exitingBlk = *iterExiting;
+    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
+    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
+    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
+  }
+
+  int numBreak = static_cast<int>(exitingBlks.size());
+  numLoopbreakPatternMatch += numBreak;
+  numClonedBlock += numCloned;
+  return numBreak + numSerial + numCloned;
+} //loopbreakPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
+                                                 BlockT *loopHeader) {
+  int numCont = 0;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
+  for (typename InvBlockGTraits::ChildIteratorType iter =
+       InvBlockGTraits::child_begin(loopHeader),
+       iterEnd = InvBlockGTraits::child_end(loopHeader);
+       iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    if (loopRep->contains(curBlk)) {
+      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
+                          loopHeader, loopRep);
+      contBlk.push_back(curBlk);
+      ++numCont;
+    }
+  }
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
+       iter = contBlk.begin(), iterEnd = contBlk.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->removeSuccessor(loopHeader);
+  }
+
+  numLoopcontPatternMatch += numCont;
+
+  return numCont;
+} //loopcontPatternMatch
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
+                                                         BlockT *src2Blk) {
+  // Return true iff src1Blk has no successors and src1Blk and src2Blk are in
+  // the same loop, and that loop already has LoopLandInfo recorded.  Without
+  // explicitly keeping track of loopContBlks and loopBreakBlks, this is a way
+  // to recover that information.
+  //
+  if (src1Blk->succ_size() == 0) {
+    LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
+    if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
+      LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+      if (theEntry != NULL) {
+        if (DEBUGME) {
+          errs() << "isLoopContBreakBlock yes src1 = BB"
+                 << src1Blk->getNumber()
+                 << " src2 = BB" << src2Blk->getNumber() << "\n";
+        }
+        return true;
+      }
+    }
+  }
+  return false;
+}  //isSameloopDetachedContbreak
+
+template<class PassT>
+int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
+                                             BlockT *trueBlk,
+                                             BlockT *falseBlk) {
+  int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
+  if (num == 0) {
+    if (DEBUGME) {
+      errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+    }
+    num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
+  }
+  return num;
+}
+
+template<class PassT>
+int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
+                                                BlockT *trueBlk,
+                                                BlockT *falseBlk) {
+  int num = 0;
+  BlockT *downBlk;
+
+  //trueBlk could be the common post dominator
+  downBlk = trueBlk;
+
+  if (DEBUGME) {
+    errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
+           << " true = BB" << trueBlk->getNumber()
+           << ", numSucc=" << trueBlk->succ_size()
+           << " false = BB" << falseBlk->getNumber() << "\n";
+  }
+
+  while (downBlk) {
+    if (DEBUGME) {
+      errs() << "check down = BB" << downBlk->getNumber();
+    }
+
+    if (//postDomTree->dominates(downBlk, falseBlk) &&
+        singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
+      if (DEBUGME) {
+        errs() << " working\n";
+      }
+
+      num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
+      num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
+
+      numClonedBlock += num;
+      num += serialPatternMatch(*headBlk->succ_begin());
+      num += serialPatternMatch(*(++headBlk->succ_begin()));
+      num += ifPatternMatch(headBlk);
+      assert(num > 0);
+
+      break;
+    }
+    if (DEBUGME) {
+      errs() << " not working\n";
+    }
+    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
+  } // walk down the postDomTree
+
+  return num;
+} //handleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
+                                                         BlockT *trueBlk,
+                                                         BlockT *falseBlk,
+                                                         BlockT *landBlk,
+                                                         bool detail) {
+  errs() << "head = BB" << headBlk->getNumber()
+         << " size = " << headBlk->size();
+  if (detail) {
+    errs() << "\n";
+    headBlk->print(errs());
+    errs() << "\n";
+  }
+
+  if (trueBlk) {
+    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
+           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      trueBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (falseBlk) {
+    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
+           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      falseBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (landBlk) {
+    errs() << ", land = BB" << landBlk->getNumber() << " size = "
+           << landBlk->size() << " numPred = " << landBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      landBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+
+  errs() << "\n";
+} //showImproveSimpleJumpintoIf
+
+template<class PassT>
+int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
+                                                    BlockT *trueBlk,
+                                                    BlockT *falseBlk,
+                                                    BlockT **plandBlk) {
+  bool migrateTrue = false;
+  bool migrateFalse = false;
+
+  BlockT *landBlk = *plandBlk;
+
+  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
+         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
+
+  if (trueBlk == falseBlk) {
+    return 0;
+  }
+
+#if 0
+  if (DEBUGME) {
+    errs() << "improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+  }
+#endif
+
+  // unsigned landPredSize = landBlk ? landBlk->pred_size() : 0;
+  // We may want to take landBlk->pred_size() into account here, since it is
+  // the number of "initReg = ..." assignments that would need to be inserted.
+  migrateTrue = needMigrateBlock(trueBlk);
+  migrateFalse = needMigrateBlock(falseBlk);
+
+  if (!migrateTrue && !migrateFalse) {
+    return 0;
+  }
+
+  // If we need to migrate either trueBlk or falseBlk, also migrate the other
+  // one if it has more than one predecessor.  Without this, initReg would be
+  // undefined on paths that reach it from a predecessor other than headBlk.
+  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
+    migrateTrue = true;
+  }
+  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
+    migrateFalse = true;
+  }
+
+  if (DEBUGME) {
+    errs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    //showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 1);
+  }
+
+  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+  //
+  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+  //      {initReg = 0; org falseBlk branch }
+  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+  //      => org landBlk
+  //      if landBlk->pred_size() > 2, put the above if-else inside
+  //      if (initReg != 2) {...}
+  //
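+  // For illustration only (a sketch; the BB numbers are hypothetical): for
+  //   BB1 (headBlk) -> { BB2 (trueBlk) | BB3 (falseBlk) } -> BB4 (landBlk),
+  // the rewrite roughly produces
+  //   BB1: if () { initReg = 1 } else { initReg = 0 }   (old branches kept)
+  //   BB4: if (initReg) { org BB2 } else { org BB3 }
+  //        ... original BB4 contents ...
+  // so BB4 becomes the single structured join point.
+  //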
+  // add initReg = initVal to headBlk
+  unsigned initReg =
+    funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+  if (!migrateTrue || !migrateFalse) {
+    int initVal = migrateTrue ? 0 : 1;
+    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
+  }
+
+  int numNewBlk = 0;
+
+  if (landBlk == NULL) {
+    landBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(landBlk);  //insert to function
+
+    if (trueBlk) {
+      trueBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    if (falseBlk) {
+      falseBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    numNewBlk++;
+  }
+
+  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+
+  // Insert AMDIL::ENDIF to avoid the special case "input landBlk == NULL".
+  typename BlockT::iterator insertPos =
+    CFGTraits::getInstrPos
+    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDIL::ENDIF, passRep));
+
+  if (landBlkHasOtherPred) {
+    unsigned immReg =
+      funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
+    unsigned cmpResReg =
+      funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+
+    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
+                                        initReg, immReg);
+    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
+                                      AMDIL::IF_LOGICALZ_i32, passRep,
+                                      cmpResReg, DebugLoc());
+  }
+
+  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDIL::IF_LOGICALNZ_i32,
+                                    passRep, initReg, DebugLoc());
+
+  if (migrateTrue) {
+    migrateInstruction(trueBlk, landBlk, insertPos);
+    // We need to insert the assignment unconditionally so that initReg has a
+    // valid value on paths that reach trueBlk from a predecessor other than
+    // headBlk (when initVal != 1).
+    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+  }
+  CFGTraits::insertInstrBefore(insertPos, AMDIL::ELSE, passRep);
+
+  if (migrateFalse) {
+    migrateInstruction(falseBlk, landBlk, insertPos);
+    // We need to insert the assignment unconditionally so that initReg has a
+    // valid value on paths that reach falseBlk from a predecessor other than
+    // headBlk (when initVal != 0).
+    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+  }
+  //CFGTraits::insertInstrBefore(insertPos, AMDIL::ENDIF, passRep);
+
+  if (landBlkHasOtherPred) {
+    // add endif
+    CFGTraits::insertInstrBefore(insertPos, AMDIL::ENDIF, passRep);
+
+    // put initReg = 2 to other predecessors of landBlk
+    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
+         ++predIter) {
+      BlockT *curBlk = *predIter;
+      if (curBlk != trueBlk && curBlk != falseBlk) {
+        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
+      }
+    } //for
+  }
+  if (DEBUGME) {
+    errs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    //showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 1);
+  }
+
+  // update landBlk
+  *plandBlk = landBlk;
+
+  return numNewBlk;
+} //improveSimpleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
+                                              LoopT *exitingLoop,
+                                             BlockT *exitBlk,
+                                              LoopT *exitLoop,
+                                             BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
+           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
+  }
+
+  RegiT initReg = INVALIDREGNUM;
+  if (exitingLoop != exitLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    assert(initReg != INVALIDREGNUM);
+    addLoopBreakInitReg(exitLoop, initReg);
+    while (exitingLoop != exitLoop && exitingLoop) {
+      addLoopBreakOnReg(exitingLoop, initReg);
+      exitingLoop = exitingLoop->getParentLoop();
+    }
+    assert(exitingLoop == exitLoop);
+  }
+
+  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
+
+} //handleLoopbreak
+
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
+                                                  LoopT *contingLoop,
+                                                 BlockT *contBlk,
+                                                  LoopT *contLoop) {
+  if (DEBUGME) {
+    errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
+           << " header = BB" << contBlk->getNumber() << "\n";
+
+    errs() << "Trying to continue loop-depth = "
+           << getLoopDepth(contLoop)
+           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
+  }
+
+  RegiT initReg = INVALIDREGNUM;
+  if (contingLoop != contLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    assert(initReg != INVALIDREGNUM);
+    addLoopContInitReg(contLoop, initReg);
+    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
+      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
+      contingLoop = contingLoop->getParentLoop();
+    }
+    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
+    addLoopContOnReg(contingLoop, initReg);
+  }
+
+  settleLoopcontBlock(contingBlk, contBlk, initReg);
+  //contingBlk->removeSuccessor(loopHeader);
+} //handleLoopcontBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+    errs() << "serialPattern BB" << dstBlk->getNumber()
+           << " <= BB" << srcBlk->getNumber() << "\n";
+  }
+  //removeUnconditionalBranch(dstBlk);
+  dstBlk->splice(dstBlk->end(), srcBlk, FirstNonDebugInstr(srcBlk), srcBlk->end());
+
+  dstBlk->removeSuccessor(srcBlk);
+  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
+
+  removeSuccessor(srcBlk);
+  retireBlock(dstBlk, srcBlk);
+} //mergeSerialBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
+                                                  BlockT *curBlk,
+                                                  BlockT *trueBlk,
+                                                  BlockT *falseBlk,
+                                                  BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "ifPattern BB" << curBlk->getNumber();
+    errs() << "{  ";
+    if (trueBlk) {
+      errs() << "BB" << trueBlk->getNumber();
+    }
+    errs() << "  } else ";
+    errs() << "{  ";
+    if (falseBlk) {
+      errs() << "BB" << falseBlk->getNumber();
+    }
+    errs() << "  }\n ";
+    errs() << "landBlock: ";
+    if (landBlk == NULL) {
+      errs() << "NULL";
+    } else {
+      errs() << "BB" << landBlk->getNumber();
+    }
+    errs() << "\n";
+  }
+
+  int oldOpcode = branchInstr->getOpcode();
+  DebugLoc branchDL = branchInstr->getDebugLoc();
+
+//    transform to
+//    if cond
+//       trueBlk
+//    else
+//       falseBlk
+//    endif
+//    landBlk
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(curBlk, branchInstr);
+  CFGTraits::insertCondBranchBefore(branchInstrPos,
+                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
+                                    passRep,
+                                    branchDL);
+
+  if (trueBlk) {
+    curBlk->splice(branchInstrPos, trueBlk, FirstNonDebugInstr(trueBlk), trueBlk->end());
+    curBlk->removeSuccessor(trueBlk);
+    if (landBlk && trueBlk->succ_size()!=0) {
+      trueBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, trueBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ELSE, passRep);
+
+  if (falseBlk) {
+    curBlk->splice(branchInstrPos, falseBlk, FirstNonDebugInstr(falseBlk),
+                   falseBlk->end());
+    curBlk->removeSuccessor(falseBlk);
+    if (landBlk && falseBlk->succ_size() != 0) {
+      falseBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, falseBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ENDIF, passRep);
+
+  //curBlk->remove(branchInstrPos);
+  branchInstr->eraseFromParent();
+
+  if (landBlk && trueBlk && falseBlk) {
+    curBlk->addSuccessor(landBlk);
+  }
+
+} //mergeIfthenelseBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
+                                                LoopLandInfo *loopLand) {
+  BlockT *landBlk = loopLand->landBlk;
+
+  if (DEBUGME) {
+    errs() << "loopPattern header = BB" << dstBlk->getNumber()
+           << " land = BB" << landBlk->getNumber() << "\n";
+  }
+
+  // Loop contInitRegs are initialized at the beginning of the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->contInitRegs.begin(),
+       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* We last inserted the DebugLoc in the BREAK_LOGICALZ_i32 or
+   * BREAK_LOGICALNZ_i32 statement in the current dstBlk.  Search for that
+   * DebugLoc; if it is not found, fall back to the empty/default DebugLoc. */
+  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
+  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrBefore(dstBlk, AMDIL::WHILELOOP, passRep, DLBreak);
+  // Loop breakInitRegs are initialized before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakInitRegs.begin(),
+       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter)
+  {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+  // Loop endbranchInitRegs are initialized before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->endbranchInitRegs.begin(),
+       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* We last inserted the DebugLoc in the continue statement in the current
+   * dstBlk.  Search for that DebugLoc; if it is not found, fall back to the
+   * empty/default DebugLoc. */
+  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
+  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrEnd(dstBlk, AMDIL::ENDLOOP, passRep, DLContinue);
+  // Loop breakOnRegs are checked after the ENDLOOP: they break out of the
+  // loop enclosing this loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakOnRegs.begin(),
+       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDIL::BREAK_LOGICALNZ_i32, passRep,
+                                   *iter);
+  }
+
+  // Loop contOnRegs are checked after the ENDLOOP: they continue the loop
+  // enclosing this loop.
+  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
+       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDIL::CONTINUE_LOGICALNZ_i32,
+                                   passRep, *iter);
+  }
+
+  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
+
+  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
+       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
+    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
+  }
+
+  removeSuccessor(landBlk);
+  retireBlock(dstBlk, landBlk);
+} //mergeLooplandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
+                                                 BlockT *exitBlk,
+                                                 BlockT *exitLandBlk,
+                                                 RegiT  setReg) {
+  if (DEBUGME) {
+    errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
+           << " exit = BB" << exitBlk->getNumber()
+           << " land = BB" << exitLandBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
+  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+
+  DebugLoc DL = branchInstr->getDebugLoc();
+
+  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+  int oldOpcode = branchInstr->getOpcode();
+
+  //    transform exitingBlk to
+  //    if ( ) {
+  //       exitBlk (if exitBlk != exitLandBlk)
+  //       setReg = 1
+  //       break
+  //    }endif
+  //    successor = {orgSuccessor(exitingBlk) - exitBlk}
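+  //
+  //    For illustration only (a sketch; BB numbers are hypothetical): an
+  //    exiting block BB5 that conditionally branches to exit block BB9
+  //    roughly becomes
+  //      BB5: IF_LOGICALNZ/Z cond
+  //             <old BB9 body, when BB9 != exitLandBlk>
+  //             setReg = 1            (only when setReg != INVALIDREGNUM)
+  //             BREAK
+  //           ENDIF
+  //    while the simple case (BB9 == exitLandBlk, no setReg) collapses to a
+  //    single BREAK_LOGICALNZ/Z cond.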
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(exitingBlk, branchInstr);
+
+  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
+    //break_logical
+    int newOpcode =
+    (trueBranch == exitBlk) ? CFGTraits::getBreakNzeroOpcode(oldOpcode)
+                            : CFGTraits::getBreakZeroOpcode(oldOpcode);
+    CFGTraits::insertCondBranchBefore(branchInstrPos, newOpcode, passRep, DL);
+  } else {
+    int newOpcode =
+    (trueBranch == exitBlk) ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                            : CFGTraits::getBranchZeroOpcode(oldOpcode);
+    CFGTraits::insertCondBranchBefore(branchInstrPos, newOpcode, passRep, DL);
+    if (exitBlk != exitLandBlk) {
+      //splice is insert-before ...
+      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
+                         exitBlk->end());
+    }
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+    }
+    CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::BREAK, passRep);
+    CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ENDIF, passRep);
+  } //if_logical
+
+  //now branchInst can be erase safely
+  //exitingBlk->eraseFromParent(branchInstr);
+  branchInstr->eraseFromParent();
+
+  //now take care of successors, retire blocks
+  exitingBlk->removeSuccessor(exitBlk);
+  if (exitBlk != exitLandBlk) {
+    //splice is insert-before ...
+    exitBlk->removeSuccessor(exitLandBlk);
+    retireBlock(exitingBlk, exitBlk);
+  }
+
+} //mergeLoopbreakBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
+                                                 BlockT *contBlk,
+                                                 RegiT   setReg) {
+  if (DEBUGME) {
+    errs() << "settleLoopcontBlock conting = BB"
+           << contingBlk->getNumber()
+           << ", cont = BB" << contBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
+  if (branchInstr) {
+    assert(CFGTraits::isCondBranch(branchInstr));
+    typename BlockT::iterator branchInstrPos =
+      CFGTraits::getInstrPos(contingBlk, branchInstr);
+    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+    int oldOpcode = branchInstr->getOpcode();
+    DebugLoc DL = branchInstr->getDebugLoc();
+
+    //    transform contingBlk to
+    //     if () {
+    //          move instr after branchInstr
+    //          continue
+    //        or
+    //          setReg = 1
+    //          break
+    //     }endif
+    //     successor = {orgSuccessor(contingBlk) - loopHeader}
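+    //
+    //    For illustration only (a sketch): when setReg is invalid and the
+    //    conditional branch is the last instruction, a single
+    //    CONTINUE_LOGICALNZ/Z cond replaces it; otherwise the block roughly
+    //    becomes
+    //      IF_LOGICALNZ/Z cond
+    //        <phi-moves, if any>
+    //        setReg = 1; BREAK      (or just CONTINUE when setReg is invalid)
+    //      ENDIF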
+
+    bool useContinueLogical =
+      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
+
+    if (!useContinueLogical) {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+
+      if (setReg != INVALIDREGNUM) {
+        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+        // insertEnd to ensure phi-moves, if any, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDIL::BREAK, passRep, DL);
+      } else {
+        // insertEnd to ensure phi-moves, if any, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDIL::CONTINUE, passRep, DL);
+      }
+
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::ENDIF, passRep, DL);
+    } else {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
+                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+    }
+
+    //contingBlk->eraseFromParent(branchInstr);
+    branchInstr->eraseFromParent();
+  } else {
+    /* If we've arrived here, the branch instruction has already been erased.
+     * Walk back up the basic block for the last debug location we inserted;
+     * it should be representative for the instructions added here. */
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
+      // insertEnd to ensure phi-moves, if any, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
+    } else {
+      // insertEnd to ensure phi-moves, if any, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
+    }
+  } //else
+
+} //settleLoopcontBlock
+
+// The BBs in exitBlkSet have been determined to be on a break path of loopRep.
+// Before we can treat them as part of loopRep's loop body, check whether any
+// of them were determined earlier to be continue blocks of parentLoopRep.
+// If so, generate a new BB newBlk and
+//    (1) make newBlk the common successor of the BBs in exitBlkSet,
+//    (2) change the continue-instr in the BBs in exitBlkSet to a break-instr,
+//    (3) generate a continue-instr in newBlk.
+//
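+// For illustration only (a sketch; BB numbers are hypothetical): if the break
+// paths of the blocks in exitBlkSet end in BB4 and BB7, each containing a
+// CONTINUE of parentLoopRep, a new block BB12 holding a single CONTINUE is
+// created, BB4 and BB7 are made to branch to BB12, and their original
+// CONTINUE instructions are erased, so they now act as breaks of loopRep.
+//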
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
+                                              LoopT *loopRep,
+                                              std::set<BlockT *> &exitBlkSet,
+                                              BlockT *exitLandBlk) {
+  std::set<BlockT *> endBlkSet;
+
+//  BlockT *parentLoopHead = parentLoopRep->getHeader();
+
+
+  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
+       iterEnd = exitBlkSet.end();
+       iter != iterEnd; ++iter) {
+    BlockT *exitBlk = *iter;
+    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
+
+    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
+      return NULL;
+
+    endBlkSet.insert(endBlk);
+  }
+
+  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(newBlk);  //insert to function
+  CFGTraits::insertInstrEnd(newBlk, AMDIL::CONTINUE, passRep);
+  SHOWNEWBLK(newBlk, "New continue block: ");
+
+  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
+       iterEnd = endBlkSet.end();
+       iter != iterEnd; ++iter) {
+      BlockT *endBlk = *iter;
+      InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
+      if (contInstr) {
+        contInstr->eraseFromParent();
+      }
+      endBlk->addSuccessor(newBlk);
+      if (DEBUGME) {
+        errs() << "Add new continue Block to BB"
+               << endBlk->getNumber() << " successors\n";
+      }
+  }
+
+  return newBlk;
+} //relocateLoopcontBlock
+
+
+// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as the
+// LoopLandBlock.  This BB branches on the loop endBranchInit register to the
+// paths corresponding to the loop's exiting branches.
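+//
+// For illustration only (a sketch): with three exiting paths, every exiting
+// block is redirected to newLandBlk, exitingBlks[i] (i >= 1) additionally
+// sets endBranchReg = i (exitingBlks[0] relies on the reg = 0 entry init),
+// and the dispatch built below looks roughly like
+//   newLandBlk:  if (endBranchReg == 0) exitBlks[0] else branchBlk1
+//   branchBlk1:  if (endBranchReg == 1) exitBlks[1] else exitBlks[2]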
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
+                                              BlockTSmallerVector &exitingBlks,
+                                              BlockTSmallerVector &exitBlks) {
+  const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+
+  RegiT endBranchReg = static_cast<int>
+    (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+  assert(endBranchReg >= 0);
+
+  // reg = 0 before entering the loop
+  addLoopEndbranchInitReg(loopRep, endBranchReg);
+
+  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
+  assert(numBlks >= 2 && numBlks == exitBlks.size());
+
+  BlockT *preExitingBlk = exitingBlks[0];
+  BlockT *preExitBlk = exitBlks[0];
+  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(preBranchBlk);  //insert to function
+  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
+
+  BlockT *newLandBlk = preBranchBlk;
+
+  CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
+                                        newLandBlk);
+  preExitingBlk->removeSuccessor(preExitBlk);
+  preExitingBlk->addSuccessor(newLandBlk);
+
+  //it is redundant to add reg = 0 to exitingBlks[0]
+
+  // For the 1st..n-th exiting paths (the last iteration handles two paths),
+  // create the branch to the previous path and the current path.
+  for (uint32_t i = 1; i < numBlks; ++i) {
+    BlockT *curExitingBlk = exitingBlks[i];
+    BlockT *curExitBlk = exitBlks[i];
+    BlockT *curBranchBlk;
+
+    if (i == numBlks - 1) {
+      curBranchBlk = curExitBlk;
+    } else {
+      curBranchBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(curBranchBlk);  //insert to function
+      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
+    }
+
+    // Add reg = i to exitingBlks[i].
+    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
+                                       endBranchReg, i);
+
+    // Remove the edge (exitingBlks[i], exitBlks[i]) and add the new edge
+    // (exitingBlks[i], newLandBlk).
+    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
+                                          newLandBlk);
+    curExitingBlk->removeSuccessor(curExitBlk);
+    curExitingBlk->addSuccessor(newLandBlk);
+
+    // add to preBranchBlk the branch instruction:
+    // if (endBranchReg == preVal)
+    //    preExitBlk
+    // else
+    //    curBranchBlk
+    //
+    // preValReg = i - 1
+
+    DebugLoc DL;
+    RegiT preValReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    BuildMI(preBranchBlk, DL, tii->get(AMDIL::LOADCONST_i32), preValReg)
+      .addImm(i - 1); //preVal
+
+    // condResReg = (endBranchReg == preValReg)
+    RegiT condResReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    BuildMI(preBranchBlk, DL, tii->get(AMDIL::IEQ), condResReg)
+      .addReg(endBranchReg).addReg(preValReg);
+
+    BuildMI(preBranchBlk, DL, tii->get(AMDIL::BRANCH_COND_i32))
+      .addMBB(preExitBlk).addReg(condResReg);
+
+    preBranchBlk->addSuccessor(preExitBlk);
+    preBranchBlk->addSuccessor(curBranchBlk);
+
+    // Update preExitingBlk, preExitBlk, preBranchBlk.
+    preExitingBlk = curExitingBlk;
+    preExitBlk = curExitBlk;
+    preBranchBlk = curBranchBlk;
+
+  }  //end for 1 .. n blocks
+
+  return newLandBlk;
+} //addLoopEndbranchBlock
+
+template<class PassT>
+typename CFGStructurizer<PassT>::PathToKind
+CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
+                                     bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return SinglePath_InPath;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == dstBlk) {
+      return SinglePath_InPath;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return Not_SinglePath;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size()==0) {
+    return SinglePath_NotInPath;
+  }
+
+  return Not_SinglePath;
+} //singlePathTo
+
+// If there is a single path from srcBlk to dstBlk, return the last block
+// before dstBlk.  If there is a single path from srcBlk to the function end
+// that does not go through dstBlk, return the last block in that path.
+// Otherwise, return NULL.
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
+                                      bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return srcBlk;
+  }
+
+  if (srcBlk->succ_size() == 0) {
+    return srcBlk;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    BlockT *preBlk = srcBlk;
+
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == NULL) {
+      return preBlk;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return NULL;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size()==0) {
+    return srcBlk;
+  }
+
+  return NULL;
+
+} //singlePathEnd
+
+template<class PassT>
+int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
+                                               BlockT *dstBlk) {
+  int cloned = 0;
+  assert(preBlk->isSuccessor(srcBlk));
+  while (srcBlk && srcBlk != dstBlk) {
+    assert(srcBlk->succ_size() == 1);
+    if (srcBlk->pred_size() > 1) {
+      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
+      ++cloned;
+    }
+
+    preBlk = srcBlk;
+    srcBlk = *srcBlk->succ_begin();
+  }
+
+  return cloned;
+} //cloneOnSideEntryTo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
+                                                 BlockT *predBlk) {
+  assert(predBlk->isSuccessor(curBlk) &&
+         "predBlk is not a predecessor of curBlk");
+
+  BlockT *cloneBlk = CFGTraits::clone(curBlk);  //clone instructions
+  CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
+  //srcBlk, oldBlk, newBlk
+
+  predBlk->removeSuccessor(curBlk);
+  predBlk->addSuccessor(cloneBlk);
+
+  // Add all successors to cloneBlk.
+  CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
+
+  numClonedInstr += curBlk->size();
+
+  if (DEBUGME) {
+    errs() << "Cloned block: " << "BB"
+           << curBlk->getNumber() << " size = " << curBlk->size() << "\n";
+  }
+
+  SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
+
+  return cloneBlk;
+} //cloneBlockForPredecessor
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
+                                               BlockT *exitingBlk) {
+  BlockT *exitBlk = NULL;
+
+  for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
+       iterSuccEnd = exitingBlk->succ_end();
+       iterSucc != iterSuccEnd; ++iterSucc) {
+    BlockT *curBlk = *iterSucc;
+    if (!loopRep->contains(curBlk)) {
+      assert(exitBlk == NULL);
+      exitBlk = curBlk;
+    }
+  }
+
+  assert(exitBlk != NULL);
+
+  return exitBlk;
+} //exitingBlock2ExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
+                                                BlockT *dstBlk,
+                                                InstrIterator insertPos) {
+  InstrIterator spliceEnd;
+  // Look for the input branch instruction, not the AMDIL branch instruction.
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+  if (branchInstr == NULL) {
+    if (DEBUGME) {
+      errs() << "migrateInstruction don't see branch instr\n" ;
+    }
+    spliceEnd = srcBlk->end();
+  } else {
+    if (DEBUGME) {
+      errs() << "migrateInstruction see branch instr\n" ;
+      branchInstr->dump();
+    }
+    spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
+  }
+  if (DEBUGME) {
+    errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
+           << " srcSize = " << srcBlk->size() << "\n";
+  }
+
+  //splice insert before insertPos
+  dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
+
+  if (DEBUGME) {
+    errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
+           << " srcSize = " << srcBlk->size() << "\n";
+  }
+} //migrateInstruction
+
+// normalizeInfiniteLoopExit changes
+//   B1:
+//        uncond_br LoopHeader
+//
+// to
+//   B1:
+//        cond_br 1 LoopHeader dummyExit
+// and returns the newly added dummy exit block.
+//
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
+  BlockT *loopHeader;
+  BlockT *loopLatch;
+  loopHeader = LoopRep->getHeader();
+  loopLatch = LoopRep->getLoopLatch();
+  BlockT *dummyExitBlk = NULL;
+  if (loopHeader!=NULL && loopLatch!=NULL) {
+    InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
+    if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
+      dummyExitBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(dummyExitBlk);  //insert to function
+      SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+
+      if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
+
+      typename BlockT::iterator insertPos =
+        CFGTraits::getInstrPos(loopLatch, branchInstr);
+      unsigned immReg =
+        funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+      CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
+      InstrT *newInstr = 
+        CFGTraits::insertInstrBefore(insertPos, AMDIL::BRANCH_COND_i32, passRep);
+      MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false);
+
+      SHOWNEWINSTR(newInstr);
+
+      branchInstr->eraseFromParent();
+      loopLatch->addSuccessor(dummyExitBlk);
+    }
+  }
+
+  return dummyExitBlk;
+} //normalizeInfiniteLoopExit
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
+  InstrT *branchInstr;
+
+  // Two unconditional branches have been seen in a single basic block (e.g.
+  // in test_fc_do_while_or.c); the upstream producer needs to be fixed so
+  // this loop can be removed.
+  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
+          && CFGTraits::isUncondBranch(branchInstr)) {
+    if (DEBUGME) {
+      errs() << "Removing unconditional branch instruction";
+      branchInstr->dump();
+    }
+    branchInstr->eraseFromParent();
+  }
+} //removeUnconditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
+  if (srcBlk->succ_size() == 2) {
+    BlockT *blk1 = *srcBlk->succ_begin();
+    BlockT *blk2 = *(++srcBlk->succ_begin());
+
+    if (blk1 == blk2) {
+      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+      if (DEBUGME) {
+        errs() << "Removing unneeded conditional branch instruction";
+        branchInstr->dump();
+      }
+      branchInstr->eraseFromParent();
+      SHOWNEWBLK(blk1, "Removing redundant successor");
+      srcBlk->removeSuccessor(blk1);
+    }
+  }
+} //removeRedundantConditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
+                                               DEFAULT_VEC_SLOTS> &retBlks) {
+  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(dummyExitBlk);  //insert to function
+  CFGTraits::insertInstrEnd(dummyExitBlk, AMDIL::RETURN, passRep);
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
+         retBlks.begin(),
+       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
+    if (curInstr) {
+      curInstr->eraseFromParent();
+    }
+#if 0
+    if (curBlk->size()==0 && curBlk->pred_size() == 1) {
+      if (DEBUGME) {
+        errs() << "Replace empty block BB" <<  curBlk->getNumber()
+          << " with dummyExitBlock\n";
+      }
+      BlockT *predb = *curBlk->pred_begin();
+      predb->removeSuccessor(curBlk);
+      curBlk = predb;
+    } //handle empty curBlk
+#endif
+    curBlk->addSuccessor(dummyExitBlk);
+    if (DEBUGME) {
+      errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
+             << " successors\n";
+    }
+  } //for
+
+  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
+} //addDummyExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
+  while (srcBlk->succ_size()) {
+    srcBlk->removeSuccessor(*srcBlk->succ_begin());
+  }
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->sccNum = sccNum;
+}
+
+template<class PassT>
+int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+    errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
+  }
+
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->isRetired = true;
+  //int i = srcBlk->succ_size();
+  //int j = srcBlk->pred_size();
+  assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
+         && "can't retire block yet");
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return (srcBlkInfo && srcBlkInfo->isRetired);
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+
+    if (loopLand == NULL)
+      return true;
+
+    BlockT *landBlk = loopLand->landBlk;
+    assert(landBlk);
+    if (!isRetiredBlock(landBlk)) {
+      return true;
+    }
+
+    loopRep = loopRep->getParentLoop();
+  }
+
+  return false;
+} //isActiveLoophead
+
+template<class PassT>
+bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
+  const unsigned blockSizeThreshold = 30;
+  const unsigned cloneInstrThreshold = 100;
+
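+  // For illustration only: a 40-instruction block with 4 predecessors would
+  // require 40 * (4 - 1) = 120 cloned instructions, which exceeds both
+  // thresholds, so needMigrateBlock returns true for such a block.
+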
+  bool multiplePreds = blk && (blk->pred_size() > 1);
+
+  if (!multiplePreds)
+    return false;
+
+  unsigned blkSize = blk->size();
+  return ((blkSize > blockSizeThreshold)
+          && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
+} //needMigrateBlock
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
+                                            BlockTSmallerVector &exitBlks,
+                                            std::set<BlockT *> &exitBlkSet) {
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks;  //in exit path blocks
+
+  for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+       predIterEnd = landBlk->pred_end();
+       predIter != predIterEnd; ++predIter) {
+    BlockT *curBlk = *predIter;
+    if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
+      inpathBlks.push_back(curBlk);
+    }
+  } //for
+
+  //if landBlk has predecessors that are not in the given loop,
+  //create a new block
+  BlockT *newLandBlk = landBlk;
+  if (inpathBlks.size() != landBlk->pred_size()) {
+    newLandBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(newLandBlk);  //insert to function
+    newLandBlk->addSuccessor(landBlk);
+    for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
+         inpathBlks.begin(),
+         iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
+      BlockT *curBlk = *iter;
+      CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
+      //srcBlk, oldBlk, newBlk
+      curBlk->removeSuccessor(landBlk);
+      curBlk->addSuccessor(newLandBlk);
+    }
+    for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
+      if (exitBlks[i] == landBlk) {
+        exitBlks[i] = newLandBlk;
+      }
+    }
+    SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
+  }
+
+  setLoopLandBlock(loopRep, newLandBlk);
+
+  return newLandBlk;
+} // recordLoopbreakLand
+
+template<class PassT>
+void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  assert(theEntry->landBlk == NULL);
+
+  if (blk == NULL) {
+    blk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(blk);  //insert to function
+    SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
+  }
+
+  theEntry->landBlk = blk;
+
+  if (DEBUGME) {
+    errs() << "setLoopLandBlock loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  landing-block = BB" << blk->getNumber() << "\n";
+  }
+} // setLoopLandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+
+  theEntry->breakOnRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopBreakOnReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopBreakOnReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->contOnRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopContOnReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopContOnReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->breakInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopBreakInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopBreakInitReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->contInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopContInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopContInitReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
+                                                     RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->endbranchInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopEndbranchInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopEndbranchInitReg
+
+template<class PassT>
+typename CFGStructurizer<PassT>::LoopLandInfo *
+CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  return theEntry;
+} // getLoopLandInfo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  return theEntry ? theEntry->landBlk : NULL;
+} // getLoopLandBlock
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  if (loopRep == NULL)
+    return false;
+
+  BlockT *loopHeader = loopRep->getHeader();
+
+  return curBlk->isSuccessor(loopHeader);
+
+} //hasBackEdge
+
+template<class PassT>
+unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
+  return loopRep ? loopRep->getLoopDepth() : 0;
+} //getLoopDepth
+
+template<class PassT>
+int CFGStructurizer<PassT>::countActiveBlock
+(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
+ typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
+  int count = 0;
+  while (iterStart != iterEnd) {
+    if (!isRetiredBlock(*iterStart)) {
+      ++count;
+    }
+    ++iterStart;
+  }
+
+  return count;
+} //countActiveBlock
+
+// This is a workaround for findNearestCommonDominator not being available on
+// the post-dominator tree; a proper fix should go into Dominators.h.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT*
+CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
+
+  if (postDomTree->dominates(blk1, blk2)) {
+    return blk1;
+  }
+  if (postDomTree->dominates(blk2, blk1)) {
+    return blk2;
+  }
+
+  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
+  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
+
+  // Handle newly cloned node.
+  if (node1 == NULL && blk1->succ_size() == 1) {
+    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
+  }
+  if (node2 == NULL && blk2->succ_size() == 1) {
+    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
+  }
+
+  if (node1 == NULL || node2 == NULL) {
+    return NULL;
+  }
+
+  node1 = node1->getIDom();
+  while (node1) {
+    if (postDomTree->dominates(node1, node2)) {
+      return node1->getBlock();
+    }
+    node1 = node1->getIDom();
+  }
+
+  return NULL;
+}
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::findNearestCommonPostDom
+(typename std::set<BlockT *> &blks) {
+  BlockT *commonDom;
+  typename std::set<BlockT *>::const_iterator iter = blks.begin();
+  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
+  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
+    BlockT *curBlk = *iter;
+    if (curBlk != commonDom) {
+      commonDom = findNearestCommonPostDom(curBlk, commonDom);
+    }
+  }
+
+  if (DEBUGME) {
+    errs() << "Common post dominator for exit blocks is ";
+    if (commonDom) {
+      errs() << "BB" << commonDom->getNumber() << "\n";
+    } else {
+      errs() << "NULL\n";
+    }
+  }
+
+  return commonDom;
+} //findNearestCommonPostDom
+
+} //end namespace llvm
+
+//todo: move-end
+
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer for AMDIL
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGStructurizer : public MachineFunctionPass
+{
+public:
+  typedef MachineInstr              InstructionType;
+  typedef MachineFunction           FunctionType;
+  typedef MachineBasicBlock         BlockType;
+  typedef MachineLoopInfo           LoopinfoType;
+  typedef MachineDominatorTree      DominatortreeType;
+  typedef MachinePostDominatorTree  PostDominatortreeType;
+  typedef MachineDomTreeNode        DomTreeNodeType;
+  typedef MachineLoop               LoopType;
+//private:
+  TargetMachine &TM;
+  const TargetInstrInfo *TII;
+
+//public:
+//  static char ID;
+
+public:
+  AMDILCFGStructurizer(char &pid, TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+  const TargetInstrInfo *getTargetInstrInfo() const;
+  //bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGStructurizer
+
+//char AMDILCFGStructurizer::ID = 0;
+} //end of namespace llvm
+AMDILCFGStructurizer::AMDILCFGStructurizer(char &pid, TargetMachine &tm
+                                           AMDIL_OPT_LEVEL_DECL)
+: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()) {
+}
+
+const TargetInstrInfo *AMDILCFGStructurizer::getTargetInstrInfo() const {
+  return TII;
+}
+//===----------------------------------------------------------------------===//
+//
+// CFGPrepare
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGPrepare : public AMDILCFGStructurizer
+{
+public:
+  static char ID;
+
+public:
+  AMDILCFGPrepare(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+
+  virtual const char *getPassName() const;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGPrepare
+
+char AMDILCFGPrepare::ID = 0;
+} //end of namespace llvm
+
+AMDILCFGPrepare::AMDILCFGPrepare(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : AMDILCFGStructurizer(ID, tm  AMDIL_OPT_LEVEL_VAR) 
+{
+}
+const char *AMDILCFGPrepare::getPassName() const {
+  return "AMD IL Control Flow Graph Preparation Pass";
+}
+
+void AMDILCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineFunctionAnalysis>();
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// CFGPerform
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGPerform : public AMDILCFGStructurizer
+{
+public:
+  static char ID;
+
+public:
+  AMDILCFGPerform(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+  virtual const char *getPassName() const;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGPerform
+
+char AMDILCFGPerform::ID = 0;
+} //end of namespace llvm
+
+AMDILCFGPerform::AMDILCFGPerform(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : AMDILCFGStructurizer(ID, tm AMDIL_OPT_LEVEL_VAR)
+{
+}
+
+const char *AMDILCFGPerform::getPassName() const {
+  return "AMD IL Control Flow Graph structurizer Pass";
+}
+
+void AMDILCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineFunctionAnalysis>();
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructTraits<AMDILCFGStructurizer>
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+// This class is tailored to the AMDIL backend.
+template<>
+struct CFGStructTraits<AMDILCFGStructurizer>
+{
+  typedef int RegiT;
+
+  static int getBreakNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::BREAK_LOGICALNZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBreakZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::BREAK_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBranchNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::IF_LOGICALNZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBranchZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::IF_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getContinueNzeroOpcode(int oldOpcode)
+  {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::CONTINUE_LOGICALNZ);
+      default:
+        assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getContinueZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::CONTINUE_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+// the explicitly represented branch target is the true branch target
+#define getExplicitBranch getTrueBranch
+#define setExplicitBranch setTrueBranch
+
+  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
+    return instr->getOperand(0).getMBB();
+  }
+
+  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
+    instr->getOperand(0).setMBB(blk);
+  }
+
+  static MachineBasicBlock *
+  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(blk->succ_size() == 2);
+    MachineBasicBlock *trueBranch = getTrueBranch(instr);
+    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
+    MachineBasicBlock::succ_iterator iterNext = iter;
+    ++iterNext;
+
+    return (*iter == trueBranch) ? *iterNext : *iter;
+  }
+
+  static bool isCondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+      ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND);
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static bool isUncondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+    case AMDIL::BRANCH:
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static bool isPhimove(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+      ExpandCaseToAllTypes(AMDIL::MOVE);
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
+    // Get the DebugLoc from the last instruction in the block that has one.
+    DebugLoc DL;
+    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end();
+         ++iter) {
+      MachineInstr *instr = &(*iter);
+      if (!instr->getDebugLoc().isUnknown()) {
+        DL = instr->getDebugLoc();
+      }
+    }
+    return DL;
+  }
+
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    MachineInstr *instr = &*iter;
+    if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
+      return instr;
+    }
+    return NULL;
+  }
+
+  // A more accurate name would be getPossibleLoopendBlockBranchInstr.
+  //
+  // A BB with a backward edge can have move instructions after the branch
+  // instruction; such move instructions "belong to" the loop's backward edge.
+  //
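+  // For illustration only (a sketch; the vreg names are hypothetical): such a
+  // block can end with
+  //   BRANCH_COND ...; MOVE vreg_phi, vreg_val
+  // where the MOVE implements a PHI copy belonging to the backward edge, so
+  // the scan below walks past the moves to find the branch.
+  //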
+  static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
+    for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
+         iterEnd = blk->rend(); iter != iterEnd; ++iter) {
+      // FIXME: Simplify
+      MachineInstr *instr = &*iter;
+      if (instr) {
+        if (isCondBranch(instr) || isUncondBranch(instr)) {
+          return instr;
+        } else if (!isPhimove(instr)) {
+          break;
+        }
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDIL::RETURN) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDIL::CONTINUE) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
+    for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) {
+      MachineInstr *instr = &(*iter);
+      if ((instr->getOpcode() == AMDIL::BREAK_LOGICALNZ_i32) || (instr->getOpcode() == AMDIL::BREAK_LOGICALZ_i32)) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static bool isReturnBlock(MachineBasicBlock *blk) {
+    MachineInstr *instr = getReturnInstr(blk);
+    bool isReturn = (blk->succ_size() == 0);
+    if (instr) {
+      assert(isReturn);
+    } else if (isReturn) {
+      if (DEBUGME) {
+        errs() << "BB" << blk->getNumber()
+               <<" is return block without RETURN instr\n";
+      }
+    }
+
+    return  isReturn;
+  }
+
+  static MachineBasicBlock::iterator
+  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(instr->getParent() == blk && "instruction doesn't belong to block");
+    MachineBasicBlock::iterator iter = blk->begin();
+    MachineBasicBlock::iterator iterEnd = blk->end();
+    while (iter != iterEnd && &(*iter) != instr) {
+      ++iter;
+    }
+
+    assert(iter != iterEnd);
+    return iter;
+  }//getInstrPos
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
+                                         AMDILCFGStructurizer *passRep) {
+    return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
+  } //insertInstrBefore
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
+                                         AMDILCFGStructurizer *passRep, DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    MachineBasicBlock::iterator res;
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
+                             AMDILCFGStructurizer *passRep) {
+    insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
+  } //insertInstrEnd
+
+  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
+                             AMDILCFGStructurizer *passRep, DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr = blk->getParent()
+      ->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    blk->push_back(newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+  } //insertInstrEnd
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
+                                         int newOpcode, 
+                                         AMDILCFGStructurizer *passRep) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DebugLoc());
+
+    blk->insert(instrPos, newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
+                                     int newOpcode,
+                                     AMDILCFGStructurizer *passRep,
+                                     DebugLoc DL) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DL);
+
+    blk->insert(instrPos, newInstr);
+    MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(),
+                                         false);
+
+    SHOWNEWINSTR(newInstr);
+    //erase later oldInstr->eraseFromParent();
+  } //insertCondBranchBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock *blk,
+                                     MachineBasicBlock::iterator insertPos,
+                                     int newOpcode,
+                                     AMDILCFGStructurizer *passRep,
+                                     RegiT regNum,
+                                     DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    //insert before
+    blk->insert(insertPos, newInstr);
+    MachineInstrBuilder(newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchBefore
+
+  static void insertCondBranchEnd(MachineBasicBlock *blk,
+                                  int newOpcode,
+                                  AMDILCFGStructurizer *passRep,
+                                  RegiT regNum) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
+
+    blk->push_back(newInstr);
+    MachineInstrBuilder(newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchEnd
+
+
+  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
+                                      AMDILCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(AMDIL::LOADCONST_i32),
+                                           DebugLoc());
+    MachineInstrBuilder(newInstr).addReg(regNum, RegState::Define); //set target
+    MachineInstrBuilder(newInstr).addImm(regVal); //set src value
+
+    blk->insert(instrPos, newInstr);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertAssignInstrBefore
+
+  static void insertAssignInstrBefore(MachineBasicBlock *blk,
+                                      AMDILCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(AMDIL::LOADCONST_i32),
+                                           DebugLoc());
+    MachineInstrBuilder(newInstr).addReg(regNum, RegState::Define); //set target
+    MachineInstrBuilder(newInstr).addImm(regVal); //set src value
+
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+
+  } //insertAssignInstrBefore
+
+  static void insertCompareInstrBefore(MachineBasicBlock *blk,
+                                       MachineBasicBlock::iterator instrPos,
+                                       AMDILCFGStructurizer *passRep,
+                                       RegiT dstReg, RegiT src1Reg,
+                                       RegiT src2Reg) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(AMDIL::IEQ), DebugLoc());
+
+    MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target
+    MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value
+    MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value
+
+    blk->insert(instrPos, newInstr);
+    SHOWNEWINSTR(newInstr);
+
+  } //insertCompareInstrBefore
+
+  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
+                                 MachineBasicBlock *srcBlk) {
+    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
+         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
+      dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of
+    }
+  } //cloneSuccessorList
+
+  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
+    MachineFunction *func = srcBlk->getParent();
+    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
+    func->push_back(newBlk);  //insert to function
+    //newBlk->setNumber(srcBlk->getNumber());
+    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
+         iterEnd = srcBlk->end();
+         iter != iterEnd; ++iter) {
+      MachineInstr *instr = func->CloneMachineInstr(iter);
+      // This is a workaround for LLVM bugzilla 8420 because CloneMachineInstr
+      // does not clone the AsmPrinterFlags.
+      instr->setAsmPrinterFlag(
+         (llvm::MachineInstr::CommentFlag)iter->getAsmPrinterFlags());
+      newBlk->push_back(instr);
+    }
+    return newBlk;
+  }
+
+  // MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose here
+  // because the AMDIL branch instruction is not recognized as a terminator.
+  // FIXME: fix that and retire this routine.
+  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
+                                         MachineBasicBlock *oldBlk,
+                                         MachineBasicBlock *newBlk) {
+    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
+    if (branchInstr && isCondBranch(branchInstr) &&
+        getExplicitBranch(branchInstr) == oldBlk) {
+      setExplicitBranch(branchInstr, newBlk);
+    }
+  }
+
+  static void wrapup(MachineBasicBlock *entryBlk) {
+    assert((!entryBlk->getParent()->getJumpTableInfo()
+            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
+           && "found a jump table");
+
+     //collect continue right before endloop
+     SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
+     MachineBasicBlock::iterator pre = entryBlk->begin();
+     MachineBasicBlock::iterator iterEnd = entryBlk->end();
+     MachineBasicBlock::iterator iter = pre;
+     while (iter != iterEnd) {
+       if (pre->getOpcode() == AMDIL::CONTINUE
+           && iter->getOpcode() == AMDIL::ENDLOOP) {
+         contInstr.push_back(pre);
+       }
+       pre = iter;
+       ++iter;
+     } //end while
+
+     //delete continue right before endloop
+     for (unsigned i = 0; i < contInstr.size(); ++i) {
+        contInstr[i]->eraseFromParent();
+     }
+
+     // TODO: fix up the jump table so later phases won't be confused.  If
+     // the jump table is not empty, we would need to clean it, but there is
+     // no such interface yet; alternatively, replace all the other blocks in
+     // the jump table with the entryBlk.
+
+  } //wrapup
+
+  static MachineDominatorTree *getDominatorTree(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachineDominatorTree>();
+  }
+
+  static MachinePostDominatorTree*
+  getPostDominatorTree(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachinePostDominatorTree>();
+  }
+
+  static MachineLoopInfo *getLoopInfo(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachineLoopInfo>();
+  }
+}; // template class CFGStructTraits
+} //end of namespace llvm
+
+// createAMDILCFGPreparationPass - Returns a pass
+FunctionPass *llvm::createAMDILCFGPreparationPass(TargetMachine &tm
+                                                  AMDIL_OPT_LEVEL_DECL) {
+  return new AMDILCFGPrepare(tm  AMDIL_OPT_LEVEL_VAR);
+}
+
+bool AMDILCFGPrepare::runOnMachineFunction(MachineFunction &func) {
+  return llvmCFGStruct::CFGStructurizer<AMDILCFGStructurizer>().prepare(func,
+                                                                        *this);
+}
+
+// createAMDILCFGStructurizerPass - Returns a pass
+FunctionPass *llvm::createAMDILCFGStructurizerPass(TargetMachine &tm
+                                                   AMDIL_OPT_LEVEL_DECL) {
+  return new AMDILCFGPerform(tm  AMDIL_OPT_LEVEL_VAR);
+}
+
+bool AMDILCFGPerform::runOnMachineFunction(MachineFunction &func) {
+  return llvmCFGStruct::CFGStructurizer<AMDILCFGStructurizer>().run(func,
+                                                                    *this);
+}
+
+//end of file newline goes below
+
diff --git a/lib/Target/AMDIL/AMDILCodeEmitter.h b/lib/Target/AMDIL/AMDILCodeEmitter.h
new file mode 100644
index 0000000..b0ea145
--- /dev/null
+++ b/lib/Target/AMDIL/AMDILCodeEmitter.h
@@ -0,0 +1,46 @@
+//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDILCODEEMITTER_H
+#define AMDILCODEEMITTER_H
+
+namespace llvm {
+
+  /* XXX: Temp HACK to work around tablegen name generation */
+  class AMDILCodeEmitter {
+  public:
+    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+    virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+                                   const MachineOperand &MO) const { return 0; }
+    virtual unsigned GPR4AlignEncode(const MachineInstr  &MI,
+                                     unsigned OpNo) const {
+      return 0;
+    }
+    virtual unsigned GPR2AlignEncode(const MachineInstr &MI,
+                                     unsigned OpNo) const {
+      return 0;
+    }
+    virtual uint64_t VOPPostEncode(const MachineInstr &MI,
+                                   uint64_t Value) const {
+      return Value;
+    }
+    virtual uint64_t i32LiteralEncode(const MachineInstr &MI,
+                                      unsigned OpNo) const {
+      return 0;
+    }
+  };
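+
+  // Illustrative sketch only (not part of this patch): a target code emitter
+  // is presumably expected to mix this class into its emission pass and
+  // override the hooks that the TableGen-generated getBinaryCodeForInstr()
+  // calls, along the lines of:
+  //
+  //   class MyTargetCodeEmitter : public MachineFunctionPass,
+  //                               public AMDILCodeEmitter {
+  //     virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+  //                                        const MachineOperand &MO) const;
+  //     // ...override whichever encode hooks the target needs...
+  //   };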
+
+} // End namespace llvm
+
+#endif // AMDILCODEEMITTER_H
diff --git a/lib/Target/AMDIL/AMDILMCCodeEmitter.cpp b/lib/Target/AMDIL/AMDILMCCodeEmitter.cpp
new file mode 100644
index 0000000..9366f2e
--- /dev/null
+++ b/lib/Target/AMDIL/AMDILMCCodeEmitter.cpp
@@ -0,0 +1,158 @@
+//===---- AMDILMCCodeEmitter.cpp - Convert AMDIL text to AMDIL binary ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "amdil-emitter"
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+#if 0
+namespace {
+  class AMDILMCCodeEmitter : public MCCodeEmitter {
+    AMDILMCCodeEmitter(const AMDILMCCodeEmitter &);// DO NOT IMPLEMENT
+    void operator=(const AMDILMCCodeEmitter &); // DO NOT IMPLEMENT
+    const TargetMachine &TM;
+    const TargetInstrInfo &TII;
+    MCContext &Ctx;
+    bool Is64BitMode;
+    public:
+    AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx, bool is64Bit);
+    ~AMDILMCCodeEmitter();
+    unsigned getNumFixupKinds() const;
+    const MCFixupKindInfo& getFixupKindInfo(MCFixupKind Kind) const;
+    static unsigned GetAMDILRegNum(const MCOperand &MO);
+    void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const;
+    void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+        raw_ostream &OS) const;
+    void EmitImmediate(const MCOperand &Disp, unsigned ImmSize,
+        MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &os,
+        SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
+
+    void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+        SmallVectorImpl<MCFixup> &Fixups) const;
+
+  }; // class AMDILMCCodeEmitter
+}; // anonymous namespace
+
+namespace llvm {
+  MCCodeEmitter *createAMDILMCCodeEmitter(const Target &,
+      TargetMachine &TM, MCContext &Ctx)
+  {
+    return new AMDILMCCodeEmitter(TM, Ctx, false);
+  }
+}
+
+AMDILMCCodeEmitter::AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx
+    , bool is64Bit)
+: TM(tm), TII(*TM.getInstrInfo()), Ctx(ctx)
+{
+  Is64BitMode = is64Bit;
+}
+
+AMDILMCCodeEmitter::~AMDILMCCodeEmitter()
+{
+}
+
+unsigned
+AMDILMCCodeEmitter::getNumFixupKinds() const
+{
+  return 0;
+}
+
+const MCFixupKindInfo &
+AMDILMCCodeEmitter::getFixupKindInfo(MCFixupKind Kind) const
+{
+//  const static MCFixupKindInfo Infos[] = {};
+  if (Kind < FirstTargetFixupKind) {
+    return MCCodeEmitter::getFixupKindInfo(Kind);
+  }
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+      "Invalid kind!");
+  return MCCodeEmitter::getFixupKindInfo(Kind);
+ // return Infos[Kind - FirstTargetFixupKind];
+
+}
+
+void
+AMDILMCCodeEmitter::EmitByte(unsigned char C, unsigned &CurByte,
+    raw_ostream &OS) const
+{
+  OS << (char) C;
+  ++CurByte;
+}
+void
+AMDILMCCodeEmitter::EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+    raw_ostream &OS) const
+{
+  // Output the constant in little endian byte order
+  for (unsigned i = 0; i != Size; ++i) {
+    EmitByte(Val & 255, CurByte, OS);
+    Val >>= 8;
+  }
+}
+void
+AMDILMCCodeEmitter::EmitImmediate(const MCOperand &DispOp, unsigned ImmSize,
+    MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+    SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const
+{
+  // If this is a simple integer displacement that doesn't require a
+  // relocation, emit it now.
+  if (DispOp.isImm()) {
+    EmitConstant(DispOp.getImm() + ImmOffset, ImmSize, CurByte, OS);
+    return;
+  }
+
+  // If we have an immoffset, add it to the expression
+  const MCExpr *Expr = DispOp.getExpr();
+
+  if (ImmOffset) {
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+        MCConstantExpr::Create(ImmOffset, Ctx), Ctx);
+  }
+  // Emit a symbolic constant as a fixup and 4 zeros.
+  Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind));
+  // TODO: Why the 4 zeros?
+  EmitConstant(0, ImmSize, CurByte, OS);
+}
+
+void
+AMDILMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+    SmallVectorImpl<MCFixup> &Fixups) const
+{
+#if 0
+  unsigned Opcode = MI.getOpcode();
+  const TargetInstrDesc &Desc = TII.get(Opcode);
+  unsigned TSFlags = Desc.TSFlags;
+
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = 0;
+
+  unsigned char BaseOpcode = 0;
+#ifndef NDEBUG
+  // FIXME: Verify.
+  if (// !Desc.isVariadic() &&
+      CurOp != NumOps) {
+    errs() << "Cannot encode all operands of: ";
+    MI.dump();
+    errs() << '\n';
+    abort();
+  }
+#endif
+#endif
+}
+#endif
diff --git a/lib/Target/AMDIL/AMDILMachinePeephole.cpp b/lib/Target/AMDIL/AMDILMachinePeephole.cpp
new file mode 100644
index 0000000..b8e5363
--- /dev/null
+++ b/lib/Target/AMDIL/AMDILMachinePeephole.cpp
@@ -0,0 +1,173 @@
+//===-- AMDILMachinePeephole.cpp - AMDIL Machine Peephole Pass -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+
+#define DEBUG_TYPE "machine_peephole"
+#if !defined(NDEBUG)
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME (false)
+#endif
+
+#include "AMDIL.h"
+#include "AMDILSubtarget.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+namespace
+{
+  class AMDILMachinePeephole : public MachineFunctionPass
+  {
+    public:
+      static char ID;
+      AMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+      //virtual ~AMDILMachinePeephole();
+      virtual const char*
+        getPassName() const;
+      virtual bool
+        runOnMachineFunction(MachineFunction &MF);
+    private:
+      void insertFence(MachineBasicBlock::iterator &MIB);
+      TargetMachine &TM;
+      bool mDebug;
+  }; // AMDILMachinePeephole
+  char AMDILMachinePeephole::ID = 0;
+} // anonymous namespace
+
+namespace llvm
+{
+  FunctionPass*
+    createAMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+    {
+      return new AMDILMachinePeephole(tm AMDIL_OPT_LEVEL_VAR);
+    }
+} // llvm namespace
+
+AMDILMachinePeephole::AMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : MachineFunctionPass(ID), TM(tm)
+{
+  mDebug = DEBUGME;
+}
+
+bool
+AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF)
+{
+  bool Changed = false;
+  const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  for (MachineFunction::iterator MBB = MF.begin(), MBE = MF.end();
+      MBB != MBE; ++MBB) {
+    MachineBasicBlock *mb = MBB;
+    for (MachineBasicBlock::iterator MIB = mb->begin(), MIE = mb->end();
+        MIB != MIE; ++MIB) {
+      MachineInstr *mi = MIB;
+      const char * name;
+      name = TM.getInstrInfo()->getName(mi->getOpcode());
+      switch (mi->getOpcode()) {
+        default:
+          if (isAtomicInst(TM.getInstrInfo(), mi)) {
+            // If we don't support the hardware-accelerated address spaces,
+            // then the atomic needs to be transformed to the global atomic.
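+            // Sketch of the rewrite done below (operands shown schematically):
+            //   ATOM_L_ADD <ptr>, ...      ; local-memory atomic
+            // becomes
+            //   R1011 = ADD_i32 <ptr>, T2
+            //   ATOM_G_ADD R1011, ...      ; global-memory atomic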
+            if (strstr(name, "_L_")
+                && STM->device()->usesSoftware(AMDILDeviceInfo::LocalMem)) {
+              BuildMI(*mb, MIB, mi->getDebugLoc(), 
+                  TM.getInstrInfo()->get(AMDIL::ADD_i32), AMDIL::R1011)
+                .addReg(mi->getOperand(1).getReg())
+                .addReg(AMDIL::T2);
+              mi->getOperand(1).setReg(AMDIL::R1011);
+              mi->setDesc(
+                  TM.getInstrInfo()->get(
+                    (mi->getOpcode() - AMDIL::ATOM_L_ADD) + AMDIL::ATOM_G_ADD));
+            } else if (strstr(name, "_R_")
+                && STM->device()->usesSoftware(AMDILDeviceInfo::RegionMem)) {
+              assert(!"Software region memory is not supported!");
+              mi->setDesc(
+                  TM.getInstrInfo()->get(
+                    (mi->getOpcode() - AMDIL::ATOM_R_ADD) + AMDIL::ATOM_G_ADD));
+            }
+          } else if ((isLoadInst(TM.getInstrInfo(), mi)
+                      || isStoreInst(TM.getInstrInfo(), mi))
+                     && isVolatileInst(TM.getInstrInfo(), mi)) {
+            insertFence(MIB);
+          }
+          continue;
+        case AMDIL::USHR_i16:
+        case AMDIL::USHR_v2i16:
+        case AMDIL::USHR_v4i16:
+        case AMDIL::USHRVEC_i16:
+        case AMDIL::USHRVEC_v2i16:
+        case AMDIL::USHRVEC_v4i16:
+          if (TM.getSubtarget<AMDILSubtarget>()
+              .device()->usesSoftware(AMDILDeviceInfo::ShortOps)) {
+            unsigned lReg = MF.getRegInfo()
+              .createVirtualRegister(&AMDIL::GPRI32RegClass);
+            unsigned Reg = MF.getRegInfo()
+              .createVirtualRegister(&AMDIL::GPRV4I32RegClass);
+            BuildMI(*mb, MIB, mi->getDebugLoc(),
+                TM.getInstrInfo()->get(AMDIL::LOADCONST_i32),
+                lReg).addImm(0xFFFF);
+            BuildMI(*mb, MIB, mi->getDebugLoc(),
+                TM.getInstrInfo()->get(AMDIL::BINARY_AND_v4i32),
+                Reg)
+              .addReg(mi->getOperand(1).getReg())
+              .addReg(lReg);
+            mi->getOperand(1).setReg(Reg);
+          }
+          break;
+        case AMDIL::USHR_i8:
+        case AMDIL::USHR_v2i8:
+        case AMDIL::USHR_v4i8:
+        case AMDIL::USHRVEC_i8:
+        case AMDIL::USHRVEC_v2i8:
+        case AMDIL::USHRVEC_v4i8:
+          if (TM.getSubtarget<AMDILSubtarget>()
+              .device()->usesSoftware(AMDILDeviceInfo::ByteOps)) {
+            unsigned lReg = MF.getRegInfo()
+              .createVirtualRegister(&AMDIL::GPRI32RegClass);
+            unsigned Reg = MF.getRegInfo()
+              .createVirtualRegister(&AMDIL::GPRV4I32RegClass);
+            BuildMI(*mb, MIB, mi->getDebugLoc(),
+                TM.getInstrInfo()->get(AMDIL::LOADCONST_i32),
+                lReg).addImm(0xFF);
+            BuildMI(*mb, MIB, mi->getDebugLoc(),
+                TM.getInstrInfo()->get(AMDIL::BINARY_AND_v4i32),
+                Reg)
+              .addReg(mi->getOperand(1).getReg())
+              .addReg(lReg);
+            mi->getOperand(1).setReg(Reg);
+          }
+          break;
+      }
+    }
+  }
+  return Changed;
+}
+
+const char*
+AMDILMachinePeephole::getPassName() const
+{
+  return "AMDIL Generic Machine Peephole Optimization Pass";
+}
+
+void
+AMDILMachinePeephole::insertFence(MachineBasicBlock::iterator &MIB)
+{
+  MachineInstr *MI = MIB;
+  MachineInstr *fence = BuildMI(*(MI->getParent()->getParent()),
+        MI->getDebugLoc(),
+        TM.getInstrInfo()->get(AMDIL::FENCE)).addReg(1);
+
+  MI->getParent()->insert(MIB, fence);
+  fence = BuildMI(*(MI->getParent()->getParent()),
+        MI->getDebugLoc(),
+        TM.getInstrInfo()->get(AMDIL::FENCE)).addReg(1);
+  MIB = MI->getParent()->insertAfter(MIB, fence);
+}
diff --git a/lib/Target/AMDIL/AMDILPeepholeOptimizer.cpp b/lib/Target/AMDIL/AMDILPeepholeOptimizer.cpp
new file mode 100644
index 0000000..5fe9f53
--- /dev/null
+++ b/lib/Target/AMDIL/AMDILPeepholeOptimizer.cpp
@@ -0,0 +1,1138 @@
+//===-- AMDILPeepholeOptimizer.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "PeepholeOpt"
+#ifdef DEBUG
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME 0
+#endif
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILDevices.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+
+#include <sstream>
+
+#if 0
+STATISTIC(PointerAssignments, "Number of dynamic pointer "
+    "assigments discovered");
+STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
+#endif
+
+using namespace llvm;
+// The peephole optimization pass is used to do simple last-minute optimizations
+// that are required for correct code or to remove redundant functions.
+namespace {
+class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
+public:
+  TargetMachine &TM;
+  static char ID;
+  AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+  ~AMDILPeepholeOpt();
+  const char *getPassName() const;
+  bool runOnFunction(Function &F);
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+protected:
+private:
+  // Function to initiate all of the instruction level optimizations.
+  bool instLevelOptimizations(BasicBlock::iterator *inst);
+  // Quick check to see if we need to dump all of the pointers into the
+  // arena. If so, then we mark all pointers as living in the arena. This
+  // is a workaround for aliasing of pointers in a struct/union.
+  bool dumpAllIntoArena(Function &F);
+  // Because we don't want to invalidate any pointers while inside the
+  // safeNestedForEach function, we push atomic conversions into a vector and
+  // handle them later. This function does the conversions if required.
+  void doAtomicConversionIfNeeded(Function &F);
+  // Because __amdil_is_constant cannot be properly evaluated if
+  // optimizations are disabled, the calls are placed in a vector
+  // and evaluated after the __amdil_image* functions are evaluated,
+  // which should allow the __amdil_is_constant function to be
+  // evaluated correctly.
+  void doIsConstCallConversionIfNeeded();
+  bool mChanged;
+  bool mDebug;
+  bool mConvertAtomics;
+  CodeGenOpt::Level optLevel;
+  // Run a series of tests to see if we can optimize a CALL instruction.
+  bool optimizeCallInst(BasicBlock::iterator *bbb);
+  // A peephole optimization to optimize bit extract sequences.
+  bool optimizeBitExtract(Instruction *inst);
+  // A peephole optimization to optimize bit insert sequences.
+  bool optimizeBitInsert(Instruction *inst);
+  bool setupBitInsert(Instruction *base, 
+                      Instruction *&src, 
+                      Constant *&mask, 
+                      Constant *&shift);
+  // Expand the bit field insert instruction on versions of OpenCL that
+  // don't support it.
+  bool expandBFI(CallInst *CI);
+  // Expand the bit field mask instruction on versions of OpenCL that
+  // don't support it.
+  bool expandBFM(CallInst *CI);
+  // On 7XX and 8XX hardware, we do not have 24-bit signed operations, so in
+  // this case we need to expand them. These functions check for the 24-bit
+  // functions and then expand them.
+  bool isSigned24BitOps(CallInst *CI);
+  void expandSigned24BitOps(CallInst *CI);
+  // One optimization that can occur is that if the required workgroup size is
+  // specified then the result of get_local_size is known at compile time and
+  // can be returned accordingly.
+  bool isRWGLocalOpt(CallInst *CI);
+  // On Northern Islands cards, the division is slightly less accurate than on
+  // previous generations, so we need to utilize a more accurate division. On
+  // all other cards we can translate the accurate divide to a normal divide.
+  bool convertAccurateDivide(CallInst *CI);
+  void expandAccurateDivide(CallInst *CI);
+  // If the alignment is set incorrectly, it can produce really inefficient
+  // code. This checks for this scenario and fixes it if possible.
+  bool correctMisalignedMemOp(Instruction *inst);
+
+  // If we are in no opt mode, then we need to make sure that
+  // local samplers are properly propagated as constant propagation 
+  // doesn't occur and we need to know the value of kernel defined
+  // samplers at compile time.
+  bool propagateSamplerInst(CallInst *CI);
+
+  LLVMContext *mCTX;
+  Function *mF;
+  const AMDILSubtarget *mSTM;
+  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
+  SmallVector<CallInst *, 16> isConstVec;
+}; // class AMDILPeepholeOpt
+  char AMDILPeepholeOpt::ID = 0;
+} // anonymous namespace
+
+namespace llvm {
+  FunctionPass *
+  createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) 
+  {
+    return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
+  }
+} // llvm namespace
+
+AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : FunctionPass(ID), TM(tm) 
+{
+  mDebug = DEBUGME;
+  optLevel = TM.getOptLevel();
+
+}
+
+AMDILPeepholeOpt::~AMDILPeepholeOpt() 
+{
+}
+
+const char *
+AMDILPeepholeOpt::getPassName() const 
+{
+  return "AMDIL PeepHole Optimization Pass";
+}
+
+bool 
+containsPointerType(Type *Ty) 
+{
+  if (!Ty) {
+    return false;
+  }
+  switch(Ty->getTypeID()) {
+  default:
+    return false;
+  case Type::StructTyID: {
+    const StructType *ST = dyn_cast<StructType>(Ty);
+    for (StructType::element_iterator stb = ST->element_begin(),
+           ste = ST->element_end(); stb != ste; ++stb) {
+      if (!containsPointerType(*stb)) {
+        continue;
+      }
+      return true;
+    }
+    break;
+  }
+  case Type::VectorTyID:
+  case Type::ArrayTyID:
+    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
+  case Type::PointerTyID:
+    return true;
+  };
+  return false;
+}
+
+bool 
+AMDILPeepholeOpt::dumpAllIntoArena(Function &F) 
+{
+  bool dumpAll = false;
+  for (Function::const_arg_iterator cab = F.arg_begin(),
+       cae = F.arg_end(); cab != cae; ++cab) {
+    const Argument *arg = cab;
+    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
+    if (!PT) {
+      continue;
+    }
+    Type *DereferencedType = PT->getElementType();
+    if (!dyn_cast<StructType>(DereferencedType) 
+        ) {
+      continue;
+    }
+    if (!containsPointerType(DereferencedType)) {
+      continue;
+    }
+    // FIXME: Because a pointer inside of a struct/union may be aliased to
+    // another pointer we need to take the conservative approach and place all
+    // pointers into the arena until more advanced detection is implemented.
+    dumpAll = true;
+  }
+  return dumpAll;
+}
+void
+AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
+{
+  if (isConstVec.empty()) {
+    return;
+  }
+  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
+    CallInst *CI = isConstVec[x];
+    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
+    Type *aType = Type::getInt32Ty(*mCTX);
+    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
+      : ConstantInt::get(aType, 0);
+    CI->replaceAllUsesWith(Val);
+    CI->eraseFromParent();
+  }
+  isConstVec.clear();
+}
+void 
+AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F) 
+{
+  // Don't do anything if we don't have any atomic operations.
+  if (atomicFuncs.empty()) {
+    return;
+  }
+  // Change the function name for the atomic if it is required
+  uint32_t size = atomicFuncs.size();
+  for (uint32_t x = 0; x < size; ++x) {
+    atomicFuncs[x].first->setOperand(
+        atomicFuncs[x].first->getNumOperands()-1, 
+        atomicFuncs[x].second);
+
+  }
+  mChanged = true;
+  if (mConvertAtomics) {
+    return;
+  }
+}
+
+bool 
+AMDILPeepholeOpt::runOnFunction(Function &MF) 
+{
+  mChanged = false;
+  mF = &MF;
+  mSTM = &TM.getSubtarget<AMDILSubtarget>();
+  if (mDebug) {
+    MF.dump();
+  }
+  mCTX = &MF.getType()->getContext();
+  mConvertAtomics = true;
+  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
+     std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
+                  this));
+
+  doAtomicConversionIfNeeded(MF);
+  doIsConstCallConversionIfNeeded();
+
+  if (mDebug) {
+    MF.dump();
+  }
+  return mChanged;
+}
+
+bool 
+AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) 
+{
+  Instruction *inst = (*bbb);
+  CallInst *CI = dyn_cast<CallInst>(inst);
+  if (!CI) {
+    return false;
+  }
+  if (isSigned24BitOps(CI)) {
+    expandSigned24BitOps(CI);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  if (propagateSamplerInst(CI)) {
+    return false;
+  }
+  if (expandBFI(CI) || expandBFM(CI)) {
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  if (convertAccurateDivide(CI)) {
+    expandAccurateDivide(CI);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+
+  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
+  if (calleeName.startswith("__amdil_is_constant")) {
+    // If we do not have optimizations, then this
+    // cannot be properly evaluated, so we add the
+    // call instruction to a vector and process
+    // them at the end of processing after the
+    // samplers have been correctly handled.
+    if (optLevel == CodeGenOpt::None) {
+      isConstVec.push_back(CI);
+      return false;
+    } else {
+      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
+      Type *aType = Type::getInt32Ty(*mCTX);
+      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
+        : ConstantInt::get(aType, 0);
+      CI->replaceAllUsesWith(Val);
+      ++(*bbb);
+      CI->eraseFromParent();
+      return true;
+    }
+  }
+
+  if (calleeName.equals("__amdil_is_asic_id_i32")) {
+    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
+    Type *aType = Type::getInt32Ty(*mCTX);
+    Value *Val = CV;
+    if (Val) {
+      Val = ConstantInt::get(aType, 
+          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
+    } else {
+      Val = ConstantInt::get(aType, 0);
+    }
+    CI->replaceAllUsesWith(Val);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
+  if (!F) {
+    return false;
+  } 
+  if (F->getName().startswith("__atom") && !CI->getNumUses() 
+      && F->getName().find("_xchg") == StringRef::npos) {
+    std::string buffer(F->getName().str() + "_noret");
+    F = dyn_cast<Function>(
+          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
+    atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
+  }
+  
+  if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
+      && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
+    return false;
+  }
+  if (!mConvertAtomics) {
+    return false;
+  }
+  StringRef name = F->getName();
+  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
+    mConvertAtomics = false;
+  }
+  return false;
+}
+
+bool
+AMDILPeepholeOpt::setupBitInsert(Instruction *base, 
+    Instruction *&src, 
+    Constant *&mask, 
+    Constant *&shift)
+{
+  if (!base) {
+    if (mDebug) {
+      dbgs() << "Null pointer passed into function.\n";
+    }
+    return false;
+  }
+  bool andOp = false;
+  if (base->getOpcode() == Instruction::Shl) {
+    shift = dyn_cast<Constant>(base->getOperand(1));
+  } else if (base->getOpcode() == Instruction::And) {
+    mask = dyn_cast<Constant>(base->getOperand(1));
+    andOp = true;
+  } else {
+    if (mDebug) {
+      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
+    }
+    // If the base is neither a Shl nor an And, we don't fit any of the
+    // patterns above.
+    return false;
+  }
+  src = dyn_cast<Instruction>(base->getOperand(0));
+  if (!src) {
+    if (mDebug) {
+      dbgs() << "Failed setup since the base operand is not an instruction!\n";
+    }
+    return false;
+  }
+  // If we find an 'and' operation, then we don't need to
+  // find the next operation as we already know the
+  // bits that are valid at this point.
+  if (andOp) {
+    return true;
+  }
+  if (src->getOpcode() == Instruction::Shl && !shift) {
+    shift = dyn_cast<Constant>(src->getOperand(1));
+    src = dyn_cast<Instruction>(src->getOperand(0));
+  } else if (src->getOpcode() == Instruction::And && !mask) {
+    mask = dyn_cast<Constant>(src->getOperand(1));
+  }
+  if (!mask && !shift) {
+    if (mDebug) {
+      dbgs() << "Failed setup since both mask and shift are NULL!\n";
+    }
+    // Did not find a constant mask or a shift.
+    return false;
+  }
+  return true;
+}
+bool
+AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst) 
+{
+  if (!inst) {
+    return false;
+  }
+  if (!inst->isBinaryOp()) {
+    return false;
+  }
+  if (inst->getOpcode() != Instruction::Or) {
+    return false;
+  }
+  if (optLevel == CodeGenOpt::None) {
+    return false;
+  }
+  // We want to do an optimization on a sequence of ops that in the end maps
+  // to a single ISA instruction.
+  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
+  // Some simplified versions of this pattern are as follows:
+  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
+  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
+  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
+  // (A & B) | (D << F) when (1 << F) >= B
+  // (A << C) | (D & E) when (1 << C) >= E
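+  //
+  // Purely illustrative instance with made-up values: for an input equivalent
+  // to (a & 0xFF00) | (d & 0x00FF), the code below ends up emitting roughly
+  //   __amdil_ubit_insert_u32(width = 8, offset = 8, a >> 8, d & 0xFF)
+  // and replaces all uses of the OR with that call.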
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    // The HD4XXX hardware doesn't support the ubit_insert instruction.
+    return false;
+  }
+  Type *aType = inst->getType();
+  bool isVector = aType->isVectorTy();
+  int numEle = 1;
+  // This optimization only works on 32bit integers.
+  if (aType->getScalarType()
+      != Type::getInt32Ty(inst->getContext())) {
+    return false;
+  }
+  if (isVector) {
+    const VectorType *VT = dyn_cast<VectorType>(aType);
+    numEle = VT->getNumElements();
+    // We currently cannot support more than 4 elements in an intrinsic and we
+    // cannot support Vec3 types.
+    if (numEle > 4 || numEle == 3) {
+      return false;
+    }
+  }
+  // TODO: Handle vectors.
+  if (isVector) {
+    if (mDebug) {
+      dbgs() << "!!! Vectors are not supported yet!\n";
+    }
+    return false;
+  }
+  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
+  Constant *LHSMask = NULL, *RHSMask = NULL;
+  Constant *LHSShift = NULL, *RHSShift = NULL;
+  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
+  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
+  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
+    if (mDebug) {
+      dbgs() << "Found an OR Operation that failed setup!\n";
+      inst->dump();
+      if (LHS) { LHS->dump(); }
+      if (LHSSrc) { LHSSrc->dump(); }
+      if (LHSMask) { LHSMask->dump(); }
+      if (LHSShift) { LHSShift->dump(); }
+    }
+    // There was an issue with the setup for BitInsert.
+    return false;
+  }
+  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
+    if (mDebug) {
+      dbgs() << "Found an OR Operation that failed setup!\n";
+      inst->dump();
+      if (RHS) { RHS->dump(); }
+      if (RHSSrc) { RHSSrc->dump(); }
+      if (RHSMask) { RHSMask->dump(); }
+      if (RHSShift) { RHSShift->dump(); }
+    }
+    // There was an issue with the setup for BitInsert.
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
+    dbgs() << "Op:        "; inst->dump();
+    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
+  }
+  Constant *offset = NULL;
+  Constant *width = NULL;
+  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
+  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
+  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
+  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
+  lhsMaskVal = (int32_t)(LHSMask 
+      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
+  rhsMaskVal = (int32_t)(RHSMask 
+      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
+  lhsShiftVal = (int32_t)(LHSShift 
+      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
+  rhsShiftVal = (int32_t)(RHSShift 
+      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
+  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
+  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
+  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
+  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
+  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
+  if (mDebug) {
+      dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
+      dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
+      dbgs() << (RHSMask ? " & E)" : ")");
+      dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
+      dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
+      dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
+      dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
+      dbgs() << "width(B) = " << lhsMaskWidth;
+      dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
+      dbgs() << "offset(B) = " << lhsMaskOffset;
+      dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
+      dbgs() << "Constraints: \n";
+      dbgs() << "\t(1) B ^ E == 0\n";
+      dbgs() << "\t(2-LHS) B is a mask\n";
+      dbgs() << "\t(2-LHS) E is a mask\n";
+      dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
+      dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
+  }
+  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
+    if (mDebug) {
+      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
+      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
+      dbgs() << "Failed constraint 1!\n";
+    }
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "LHS = " << lhsMaskOffset << "";
+    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
+    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
+    dbgs() << "\nRHS = " << rhsMaskOffset << "";
+    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
+    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
+    dbgs() << "\n";
+  }
+  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
+    offset = ConstantInt::get(aType, lhsMaskOffset, false);
+    width = ConstantInt::get(aType, lhsMaskWidth, false);
+    RHSSrc = RHS;
+    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
+      if (mDebug) {
+        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
+        dbgs() << "Failed constraint 2!\n";
+      }
+      return false;
+    }
+    if (!LHSShift) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", LHS);
+    } else if (lhsShiftVal != lhsMaskOffset) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", LHS);
+    }
+    if (mDebug) {
+      dbgs() << "Optimizing LHS!\n";
+    }
+  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
+    offset = ConstantInt::get(aType, rhsMaskOffset, false);
+    width = ConstantInt::get(aType, rhsMaskWidth, false);
+    LHSSrc = RHSSrc;
+    RHSSrc = LHS;
+    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
+      if (mDebug) {
+        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
+        dbgs() << "Failed constraint 2!\n";
+      }
+      return false;
+    }
+    if (!RHSShift) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", RHS);
+    } else if (rhsShiftVal != rhsMaskOffset) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", RHS);
+    }
+    if (mDebug) {
+      dbgs() << "Optimizing RHS!\n";
+    }
+  } else {
+    if (mDebug) {
+      dbgs() << "Failed constraint 3!\n";
+    }
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
+  }
+  if (!offset || !width) {
+    if (mDebug) {
+      dbgs() << "Either width or offset are NULL, failed detection!\n";
+    }
+    return false;
+  }
+  // Let's create the function signature.
+  std::vector<Type *> callTypes;
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
+  std::string name = "__amdil_ubit_insert";
+  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
+  Function *Func = 
+    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
+        getOrInsertFunction(llvm::StringRef(name), funcType));
+  Value *Operands[4] = {
+    width,
+    offset,
+    LHSSrc,
+    RHSSrc
+  };
+  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
+  if (mDebug) {
+    dbgs() << "Old Inst: ";
+    inst->dump();
+    dbgs() << "New Inst: ";
+    CI->dump();
+    dbgs() << "\n\n";
+  }
+  CI->insertBefore(inst);
+  inst->replaceAllUsesWith(CI);
+  return true;
+}
+
+bool 
+AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst) 
+{
+  if (!inst) {
+    return false;
+  }
+  if (!inst->isBinaryOp()) {
+    return false;
+  }
+  if (inst->getOpcode() != Instruction::And) {
+    return false;
+  }
+  if (optLevel == CodeGenOpt::None) {
+    return false;
+  }
+  // We want to do some simple optimizations on shift-right/and patterns. The
+  // basic optimization is to rewrite (A >> B) & C, where A is a 32-bit type,
+  // B is a value smaller than 32 and C is a mask. If C is a constant value,
+  // then the following transformation can occur. For signed integers, it
+  // turns into the function call dst = __amdil_ibit_extract(log2(C), B, A).
+  // For unsigned integers, it turns into the function call
+  // dst = __amdil_ubit_extract(log2(C), B, A). The __amdil_[u|i]bit_extract
+  // functions can be found in Section 7.9 of the ATI IL spec of the stream
+  // SDK for Evergreen hardware.
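+  //
+  // Purely illustrative instance with made-up values: for an input equivalent
+  // to (a >> 3) & 0xFF, the code below emits roughly
+  //   __amdil_ubit_extract_i32(8 /*trailing ones in mask*/, 3 /*shift*/, a)
+  // and replaces all uses of the AND with that call.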
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    // This does not work on HD4XXX hardware.
+    return false;
+  }
+  Type *aType = inst->getType();
+  bool isVector = aType->isVectorTy();
+  int numEle = 1;
+  // This only works on 32bit integers
+  if (aType->getScalarType()
+      != Type::getInt32Ty(inst->getContext())) {
+    return false;
+  }
+  if (isVector) {
+    const VectorType *VT = dyn_cast<VectorType>(aType);
+    numEle = VT->getNumElements();
+    // We currently cannot support more than 4 elements in an intrinsic and we
+    // cannot support Vec3 types.
+    if (numEle > 4 || numEle == 3) {
+      return false;
+    }
+  }
+  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
+  // If the first operand is not a shift instruction, then we can return as it
+  // doesn't match this pattern.
+  if (!ShiftInst || !ShiftInst->isShift()) {
+    return false;
+  }
+  // If this is a shift left, then it doesn't match this pattern.
+  if (ShiftInst->getOpcode() == Instruction::Shl) {
+    return false;
+  }
+  bool isSigned = ShiftInst->isArithmeticShift();
+  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
+  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
+  // Let's make sure that the shift value and the and mask are constant integers.
+  if (!AndMask || !ShrVal) {
+    return false;
+  }
+  Constant *newMaskConst;
+  Constant *shiftValConst;
+  if (isVector) {
+    // Handle the vector case
+    std::vector<Constant *> maskVals;
+    std::vector<Constant *> shiftVals;
+    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
+    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
+    Type *scalarType = AndMaskVec->getType()->getScalarType();
+    assert(AndMaskVec->getNumOperands() ==
+           ShrValVec->getNumOperands() && "cannot have a "
+           "combination where the number of elements to a "
+           "shift and an and are different!");
+    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
+      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
+      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
+      if (!AndCI || !ShiftIC) {
+        return false;
+      }
+      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
+      if (!isMask_32(maskVal)) {
+        return false;
+      }
+      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
+      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
+      // If the mask or shiftval is greater than the bitcount, then break out.
+      if (maskVal >= 32 || shiftVal >= 32) {
+        return false;
+      }
+      // If the mask val is greater than the number of original bits left,
+      // then this optimization is invalid.
+      if (maskVal > (32 - shiftVal)) {
+        return false;
+      }
+      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
+      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
+    }
+    newMaskConst = ConstantVector::get(maskVals);
+    shiftValConst = ConstantVector::get(shiftVals);
+  } else {
+    // Handle the scalar case
+    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
+    // This must be a mask value where all lower bits are set to 1 and then any
+    // bit higher is set to 0.
+    if (!isMask_32(maskVal)) {
+      return false;
+    }
+    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
+    // Count the number of bits set in the mask, this is the width of the
+    // resulting bit set that is extracted from the source value.
+    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
+    // If the mask or shift val is greater than the bitcount, then break out.
+    if (maskVal >= 32 || shiftVal >= 32) {
+      return false;
+    }
+    // If the mask val is greater than the number of original bits left, then
+    // this optimization is invalid.
+    if (maskVal > (32 - shiftVal)) {
+      return false;
+    }
+    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
+    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
+  }
+  // Let's create the function signature.
+  std::vector<Type *> callTypes;
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
+  std::string name = "__amdil_ubit_extract";
+  if (isVector) {
+    name += "_v" + itostr(numEle) + "i32";
+  } else {
+    name += "_i32";
+  }
+  // Let's create the function.
+  Function *Func = 
+    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(llvm::StringRef(name), funcType));
+  Value *Operands[3] = {
+    newMaskConst,
+    shiftValConst,
+    ShiftInst->getOperand(0)
+  };
+  // Let's create the call with the operands.
+  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
+  CI->insertBefore(inst);
+  inst->replaceAllUsesWith(CI);
+  return true;
+}
+
+bool
+AMDILPeepholeOpt::expandBFI(CallInst *CI)
+{
+  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  if (!LHS->getName().startswith("__amdil_bfi")) {
+    return false;
+  }
+  Type* type = CI->getOperand(0)->getType();
+  Constant *negOneConst = NULL;
+  if (type->isVectorTy()) {
+    std::vector<Constant *> negOneVals;
+    negOneConst = ConstantInt::get(CI->getContext(), 
+        APInt(32, StringRef("-1"), 10));
+    for (size_t x = 0,
+        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
+      negOneVals.push_back(negOneConst);
+    }
+    negOneConst = ConstantVector::get(negOneVals);
+  } else {
+    negOneConst = ConstantInt::get(CI->getContext(), 
+        APInt(32, StringRef("-1"), 10));
+  }
+  // __amdil_bfi => (A & B) | (~A & C)
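+  // (Illustrative: with A = 0x0000FF00 this takes bits 8-15 from B and every
+  //  other bit from C, i.e. a per-bit select controlled by the mask A.)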
+  BinaryOperator *lhs = 
+    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
+        CI->getOperand(1), "bfi_and", CI);
+  BinaryOperator *rhs =
+    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
+        "bfi_not", CI);
+  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
+      "bfi_and", CI);
+  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
+  CI->replaceAllUsesWith(lhs);
+  return true;
+}
+
+bool
+AMDILPeepholeOpt::expandBFM(CallInst *CI)
+{
+  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  if (!LHS->getName().startswith("__amdil_bfm")) {
+    return false;
+  }
+  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
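+  // (Illustrative: src0 = 5, src1 = 3 gives ((1 << 5) - 1) << 3 = 0x1F << 3
+  //  = 0xF8, i.e. a 5-bit mask placed at bit offset 3.)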
+  Constant *newMaskConst = NULL;
+  Constant *newShiftConst = NULL;
+  Type* type = CI->getOperand(0)->getType();
+  if (type->isVectorTy()) {
+    std::vector<Constant*> newMaskVals, newShiftVals;
+    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
+    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
+    for (size_t x = 0,
+        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
+      newMaskVals.push_back(newMaskConst);
+      newShiftVals.push_back(newShiftConst);
+    }
+    newMaskConst = ConstantVector::get(newMaskVals);
+    newShiftConst = ConstantVector::get(newShiftVals);
+  } else {
+    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
+    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
+  }
+  BinaryOperator *lhs =
+    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
+        newMaskConst, "bfm_mask", CI);
+  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
+      lhs, "bfm_shl", CI);
+  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
+      newShiftConst, "bfm_sub", CI);
+  BinaryOperator *rhs =
+    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
+        newMaskConst, "bfm_mask", CI);
+  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
+  CI->replaceAllUsesWith(lhs);
+  return true;
+}
+
+bool
+AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) 
+{
+  Instruction *inst = (*bbb);
+  if (optimizeCallInst(bbb)) {
+    return true;
+  }
+  if (optimizeBitExtract(inst)) {
+    return false;
+  }
+  if (optimizeBitInsert(inst)) {
+    return false;
+  }
+  if (correctMisalignedMemOp(inst)) {
+    return false;
+  }
+  return false;
+}
+bool
+AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
+{
+  LoadInst *linst = dyn_cast<LoadInst>(inst);
+  StoreInst *sinst = dyn_cast<StoreInst>(inst);
+  unsigned alignment;
+  Type* Ty = inst->getType();
+  if (linst) {
+    alignment = linst->getAlignment();
+    Ty = inst->getType();
+  } else if (sinst) {
+    alignment = sinst->getAlignment();
+    Ty = sinst->getValueOperand()->getType();
+  } else {
+    return false;
+  }
+  unsigned size = getTypeSize(Ty);
+  if (size <= alignment) {
+    return false;
+  }
+  if (!Ty->isStructTy()) {
+    return false;
+  }
+  if (alignment < 4) {
+    if (linst) {
+      linst->setAlignment(0);
+      return true;
+    } else if (sinst) {
+      sinst->setAlignment(0);
+      return true;
+    }
+  }
+  return false;
+}
+bool 
+AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI) 
+{
+  if (!CI) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  std::string namePrefix = LHS->getName().substr(0, 14);
+  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
+      && namePrefix != "__amdil_imul24_high") {
+    return false;
+  }
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
+    return false;
+  }
+  return true;
+}
+
+void 
+AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI) 
+{
+  assert(isSigned24BitOps(CI) && "Must be a "
+      "signed 24 bit operation to call this function!");
+  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
+  // On 7XX and 8XX we do not have signed 24bit, so we need to
+  // expand it to the following:
+  // imul24 turns into 32bit imul
+  // imad24 turns into 32bit imad
+  // imul24_high turns into 32bit imulhigh
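+  // For example, a scalar __amdil_imad24(a, b, c) call is rewritten below into
+  // a call to __amdil_imad_i32 (or __amdil_imad_v<N>i32 for vectors),
+  // __amdil_imul24(a, b) becomes a plain 32-bit 'mul' instruction, and
+  // __amdil_imul24_high(a, b) becomes a call to __amdil_imul_high_i32.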
+  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
+    Type *aType = CI->getOperand(0)->getType();
+    bool isVector = aType->isVectorTy();
+    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
+    std::vector<Type*> callTypes;
+    callTypes.push_back(CI->getOperand(0)->getType());
+    callTypes.push_back(CI->getOperand(1)->getType());
+    callTypes.push_back(CI->getOperand(2)->getType());
+    FunctionType *funcType =
+      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
+    std::string name = "__amdil_imad";
+    if (isVector) {
+      name += "_v" + itostr(numEle) + "i32";
+    } else {
+      name += "_i32";
+    }
+    Function *Func = dyn_cast<Function>(
+                       CI->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(llvm::StringRef(name), funcType));
+    Value *Operands[3] = {
+      CI->getOperand(0),
+      CI->getOperand(1),
+      CI->getOperand(2)
+    };
+    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
+    nCI->insertBefore(CI);
+    CI->replaceAllUsesWith(nCI);
+  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24"
+             && LHS->getName().substr(0, 19) != "__amdil_imul24_high") {
+    BinaryOperator *mulOp =
+      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
+          CI->getOperand(1), "imul24", CI);
+    CI->replaceAllUsesWith(mulOp);
+  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
+    Type *aType = CI->getOperand(0)->getType();
+
+    bool isVector = aType->isVectorTy();
+    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
+    std::vector<Type*> callTypes;
+    callTypes.push_back(CI->getOperand(0)->getType());
+    callTypes.push_back(CI->getOperand(1)->getType());
+    FunctionType *funcType =
+      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
+    std::string name = "__amdil_imul_high";
+    if (isVector) {
+      name += "_v" + itostr(numEle) + "i32";
+    } else {
+      name += "_i32";
+    }
+    Function *Func = dyn_cast<Function>(
+                       CI->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(llvm::StringRef(name), funcType));
+    Value *Operands[2] = {
+      CI->getOperand(0),
+      CI->getOperand(1)
+    };
+    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
+    nCI->insertBefore(CI);
+    CI->replaceAllUsesWith(nCI);
+  }
+}
+
+bool 
+AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI) 
+{
+  return (CI != NULL
+          && CI->getOperand(CI->getNumOperands() - 1)->getName() 
+          == "__amdil_get_local_size_int");
+}
+
+bool 
+AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI) 
+{
+  if (!CI) {
+    return false;
+  }
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
+      && (mSTM->getDeviceName() == "cayman")) {
+    return false;
+  }
+  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) 
+      == "__amdil_improved_div";
+}
+
+void 
+AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI) 
+{
+  assert(convertAccurateDivide(CI)
+         && "expanding accurate divide can only happen if it is expandable!");
+  BinaryOperator *divOp =
+    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
+                           CI->getOperand(1), "fdiv32", CI);
+  CI->replaceAllUsesWith(divOp);
+}
+
+bool
+AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
+{
+  if (optLevel != CodeGenOpt::None) {
+    return false;
+  }
+
+  if (!CI) {
+    return false;
+  }
+
+  unsigned funcNameIdx = CI->getNumOperands() - 1;
+  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
+  if (calleeName != "__amdil_image2d_read_norm"
+   && calleeName != "__amdil_image2d_read_unnorm"
+   && calleeName != "__amdil_image3d_read_norm"
+   && calleeName != "__amdil_image3d_read_unnorm") {
+    return false;
+  }
+
+  unsigned samplerIdx = 1;
+  Value *sampler = CI->getOperand(samplerIdx);
+  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
+  if (!lInst) {
+    return false;
+  }
+
+  if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
+    return false;
+  }
+
+  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
+  // If we are not loading from a global value, then we
+  // fail and return.
+  if (!gv) {
+    return false;
+  }
+
+  // If there is no initializer, or the initializer is not a
+  // 32-bit integer, we fail.
+  if (!gv->hasInitializer() 
+      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
+      return false;
+  }
+
+  // Now that we have the global variable initializer, let's replace
+  // all uses of the load instruction with the samplerVal and
+  // reparse the __amdil_is_constant() function.
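+  // This handles, for example, a sampler argument that the front end lowered
+  // to a global variable in the private address space with a constant i32
+  // initializer: the load of that global is folded away here and its users
+  // see the initializer value directly.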
+  Constant *samplerVal = gv->getInitializer();
+  lInst->replaceAllUsesWith(samplerVal);
+  return true;
+}
+
+bool 
+AMDILPeepholeOpt::doInitialization(Module &M) 
+{
+  return false;
+}
+
+bool 
+AMDILPeepholeOpt::doFinalization(Module &M) 
+{
+  return false;
+}
+
+void 
+AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const 
+{
+  AU.addRequired<MachineFunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
diff --git a/lib/Target/AMDIL/R600CodeEmitter.cpp b/lib/Target/AMDIL/R600CodeEmitter.cpp
new file mode 100644
index 0000000..8faf0de
--- /dev/null
+++ b/lib/Target/AMDIL/R600CodeEmitter.cpp
@@ -0,0 +1,749 @@
+//===-- R600CodeEmitter.cpp - R600 Machine Code Emitter ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass emits the R600 machine instructions of a function as a byte
+// stream that is later assembled into the final hardware encoding by
+// r600_asm (see the kc_bank and bank swizzle notes below).
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDILCodeEmitter.h"
+#include "AMDILInstrInfo.h"
+#include "AMDILUtilityFunctions.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <stdio.h>
+
+#define SRC_BYTE_COUNT 11
+#define DST_BYTE_COUNT 5
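+
+/* Byte layout written by emitSrc() and emitDst() below:
+ *   source (SRC_BYTE_COUNT = 11): sel (2) + chan (1) + neg (1) + abs (1) +
+ *                                 rel (1) + kc_bank (1) + literal value (4)
+ *   dest   (DST_BYTE_COUNT = 5):  register (1) + element (1) + clamp (1) +
+ *                                 write mask (1) + rel (1) */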
+
+using namespace llvm;
+
+namespace {
+
+  class R600CodeEmitter : public MachineFunctionPass, public AMDILCodeEmitter {
+
+  private:
+
+  static char ID;
+  formatted_raw_ostream &_OS;
+  const TargetMachine * TM;
+  const MachineRegisterInfo * MRI;
+  const R600RegisterInfo * TRI;
+  bool evergreenEncoding;
+
+  bool isReduction;
+  unsigned reductionElement;
+  bool isLast;
+
+  unsigned section_start;
+
+  public:
+
+  R600CodeEmitter(formatted_raw_ostream &OS) : MachineFunctionPass(ID),
+      _OS(OS), TM(NULL), evergreenEncoding(false), isReduction(false),
+      isLast(true) { }
+
+  const char *getPassName() const { return "AMDGPU Machine Code Emitter"; }
+
+  bool runOnMachineFunction(MachineFunction &MF);
+  virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+                                     const MachineOperand &MO) const;
+
+  private:
+
+  void emitALUInstr(MachineInstr  &MI);
+  void emitSrc(const MachineOperand & MO);
+  void emitDst(const MachineOperand & MO);
+  void emitALU(MachineInstr &MI, unsigned numSrc);
+  void emitTexInstr(MachineInstr &MI);
+  void emitFCInstr(MachineInstr &MI);
+
+  unsigned int getHWInst(const MachineInstr &MI);
+
+  void emitNullBytes(unsigned int byteCount);
+
+  void emitByte(unsigned int byte);
+
+  void emitTwoBytes(uint32_t bytes);
+
+  void emit(uint32_t value);
+  void emit(uint64_t value);
+
+  unsigned getHWReg(unsigned regNo) const;
+
+  unsigned getElement(unsigned regNo);
+
+};
+
+} /* End anonymous namespace */
+
+#define WRITE_MASK_X 0x1
+#define WRITE_MASK_Y 0x2
+#define WRITE_MASK_Z 0x4
+#define WRITE_MASK_W 0x8
+
+enum RegElement {
+  ELEMENT_X = 0,
+  ELEMENT_Y,
+  ELEMENT_Z,
+  ELEMENT_W
+};
+
+enum InstrTypes {
+  INSTR_ALU = 0,
+  INSTR_TEX,
+  INSTR_FC,
+  INSTR_NATIVE,
+  INSTR_VTX
+};
+
+enum FCInstr {
+  FC_IF = 0,
+  FC_ELSE,
+  FC_ENDIF,
+  FC_BGNLOOP,
+  FC_ENDLOOP,
+  FC_BREAK,
+  FC_BREAK_NZ_INT,
+  FC_CONTINUE,
+  FC_BREAK_Z_INT
+};
+
+enum TextureTypes {
+  TEXTURE_1D = 1,
+  TEXTURE_2D, 
+  TEXTURE_3D,
+  TEXTURE_CUBE,
+  TEXTURE_RECT,
+  TEXTURE_SHADOW1D,
+  TEXTURE_SHADOW2D,
+  TEXTURE_SHADOWRECT,     
+  TEXTURE_1D_ARRAY,       
+  TEXTURE_2D_ARRAY,
+  TEXTURE_SHADOW1D_ARRAY,
+  TEXTURE_SHADOW2D_ARRAY
+};
+
+char R600CodeEmitter::ID = 0;
+
+FunctionPass *llvm::createR600CodeEmitterPass(formatted_raw_ostream &OS) {
+  return new R600CodeEmitter(OS);
+}
+
+bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+
+  TM = &MF.getTarget();
+  MRI = &MF.getRegInfo();
+  TRI = static_cast<const R600RegisterInfo *>(TM->getRegisterInfo());
+  const AMDILSubtarget &STM = TM->getSubtarget<AMDILSubtarget>();
+  std::string gpu = STM.getDeviceName();
+  if (!gpu.compare(0,3, "rv7")) {
+    evergreenEncoding = false;
+  } else {
+    evergreenEncoding = true;
+  }
+  const AMDGPUTargetMachine *amdtm =
+    static_cast<const AMDGPUTargetMachine *>(&MF.getTarget());
+
+  if (amdtm->shouldDumpCode()) {
+    MF.dump();
+  }
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+     MachineBasicBlock &MBB = *BB;
+     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                       I != E; ++I) {
+          MachineInstr &MI = *I;
+          if (MI.getNumOperands() > 1 && MI.getOperand(0).isReg()
+              && MI.getOperand(0).isDead()) {
+            continue;
+          }
+          if (isTexOp(MI.getOpcode())) {
+            emitTexInstr(MI);
+          } else if (isFCOp(MI.getOpcode())){
+            emitFCInstr(MI);
+          } else if (isReductionOp(MI.getOpcode())) {
+            isReduction = true;
+            isLast = false;
+            for (reductionElement = 0; reductionElement < 4; reductionElement++) {
+              isLast = (reductionElement == 3);
+              emitALUInstr(MI);
+            }
+            isReduction = false;
+          } else if (MI.getOpcode() == AMDIL::RETURN ||
+                     MI.getOpcode() == AMDIL::BUNDLE ||
+                     MI.getOpcode() == AMDIL::KILL) {
+            continue;
+          } else {
+            switch(MI.getOpcode()) {
+            case AMDIL::RAT_WRITE_CACHELESS_eg:
+              {
+                /* XXX: Support for autoencoding 64-bit instructions was added
+                 * in LLVM 3.1.  Until we drop support for 3.0, we will use magic
+                 * numbers for the high bits. */
+                  uint64_t high = 0x95c0100000000000;
+                  uint64_t inst = getBinaryCodeForInstr(MI);
+                  inst |= high;
+                /* Set End Of Program bit */
+                /* XXX: Need better check of end of program.  EOP should be
+                 * encoded in one of the operands of the MI, and it should be
+                 * set in a prior pass. */
+                MachineBasicBlock::iterator NextI = llvm::next(I);
+                MachineInstr &NextMI = *NextI;
+                if (NextMI.getOpcode() == AMDIL::RETURN) {
+                  inst |= (((uint64_t)1) << 53);
+                }
+                emitByte(INSTR_NATIVE);
+                emit(inst);
+                break;
+              }
+            case AMDIL::VTX_READ_eg:
+              {
+                emitByte(INSTR_VTX);
+                /* inst */
+                emitByte(0);
+
+                /* fetch_type */
+                emitByte(2);
+
+                /* buffer_id */
+                emitByte(MI.getOperand(2).getImm());
+
+                /* src_gpr */
+                emitByte(getHWReg(MI.getOperand(1).getReg()));
+
+                /* src_sel_x */
+                emitByte(TRI->getHWRegChan(MI.getOperand(1).getReg()));
+
+                /* mega_fetch_count */
+                emitByte(3);
+
+                /* dst_gpr */
+                emitByte(getHWReg(MI.getOperand(0).getReg()));
+
+                /* dst_sel_x */
+                emitByte(0);
+
+                /* dst_sel_y */
+                emitByte(7);
+
+                /* dst_sel_z */
+                emitByte(7);
+
+                /* dst_sel_w */
+                emitByte(7);
+
+                /* use_const_fields */
+                emitByte(1);
+
+                /* data_format */
+                emitByte(0);
+
+                /* num_format_all */
+                emitByte(0);
+
+                /* format_comp_all */
+                emitByte(0);
+
+                /* srf_mode_all */
+                emitByte(0);
+
+                /* offset */
+                emitByte(0);
+
+                /* endian */
+                emitByte(0);
+                break;
+              }
+
+            default:
+              emitALUInstr(MI);
+              break;
+          }
+        }
+    }
+  }
+  return false;
+}
+
+void R600CodeEmitter::emitALUInstr(MachineInstr &MI)
+{
+
+  unsigned numOperands = MI.getNumExplicitOperands();
+
+  /* Some instructions are just placeholder instructions that represent
+   * operations that the GPU does automatically.  They should be ignored. */
+  if (isPlaceHolderOpcode(MI.getOpcode())) {
+    return;
+  }
+
+  /* We need to handle some opcodes differently */
+  switch (MI.getOpcode()) {
+    default: break;
+
+    /* XXX: Temp Hack */
+    case AMDIL::STORE_OUTPUT:
+      numOperands = 2;
+      break;
+  }
+
+  /* XXX Check if instruction writes a result */
+  if (numOperands < 1) {
+    return;
+  }
+  const MachineOperand dstOp = MI.getOperand(0);
+
+  /* Emit instruction type */
+  emitByte(0);
+
+  unsigned int opIndex;
+  for (opIndex = 1; opIndex < numOperands; opIndex++) {
+    /* Literal constants are always stored as the last operand. */
+    if (MI.getOperand(opIndex).isImm() || MI.getOperand(opIndex).isFPImm()) {
+      break;
+    }
+    emitSrc(MI.getOperand(opIndex));
+  }
+
+  /* Emit zeros for unused sources */
+  for ( ; opIndex < 4; opIndex++) {
+    emitNullBytes(SRC_BYTE_COUNT);
+  }
+
+  emitDst(dstOp);
+
+  emitALU(MI, numOperands - 1);
+}
+
+void R600CodeEmitter::emitSrc(const MachineOperand & MO)
+{
+  uint32_t value = 0;
+  /* Emit the source select (2 bytes).  For GPRs, this is the register index.
+   * For other potential instruction operands (e.g. constant registers), the
+   * value of the source select is defined in the r600isa docs. */
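+  /* For example, a GPR operand emits its hardware register index here, and a
+   * use of ALU_LITERAL_X additionally carries the literal itself in the last
+   * four bytes of this source record (emitted at the end of the function). */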
+  if (MO.isReg()) {
+    unsigned reg = MO.getReg();
+    emitTwoBytes(getHWReg(reg));
+    if (reg == AMDIL::ALU_LITERAL_X) {
+      const MachineInstr * parent = MO.getParent();
+      unsigned immOpIndex = parent->getNumExplicitOperands() - 1;
+      MachineOperand immOp = parent->getOperand(immOpIndex);
+      if (immOp.isFPImm()) {
+        value = immOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue();
+      } else {
+        assert(immOp.isImm());
+        value = immOp.getImm();
+      }
+    }
+  } else {
+    /* XXX: Handle other operand types. */
+    emitTwoBytes(0);
+  }
+
+  /* Emit the source channel (1 byte) */
+  if (isReduction) {
+    emitByte(reductionElement);
+  } else if (MO.isReg()) {
+    emitByte(TRI->getHWRegChan(MO.getReg()));
+  } else {
+    emitByte(0);
+  }
+
+  /* XXX: Emit isNegated (1 byte) */
+  if ((!(MO.getTargetFlags() & MO_FLAG_ABS))
+      && (MO.getTargetFlags() & MO_FLAG_NEG ||
+     (MO.isReg() &&
+      (MO.getReg() == AMDIL::NEG_ONE || MO.getReg() == AMDIL::NEG_HALF)))){
+    emitByte(1);
+  } else {
+    emitByte(0);
+  }
+
+  /* Emit isAbsolute (1 byte) */
+  if (MO.getTargetFlags() & MO_FLAG_ABS) {
+    emitByte(1);
+  } else {
+    emitByte(0);
+  }
+
+  /* XXX: Emit relative addressing mode (1 byte) */
+  emitByte(0);
+
+  /* Emit kc_bank.  This will be adjusted later by r600_asm. */
+  emitByte(0);
+
+  /* Emit the literal value, if applicable (4 bytes).  */
+  emit(value);
+
+}
+
+void R600CodeEmitter::emitDst(const MachineOperand & MO)
+{
+  if (MO.isReg()) {
+    /* Emit the destination register index (1 byte) */
+    emitByte(getHWReg(MO.getReg()));
+
+    /* Emit the element of the destination register (1 byte)*/
+    if (isReduction) {
+      emitByte(reductionElement);
+    } else {
+      emitByte(TRI->getHWRegChan(MO.getReg()));
+    }
+
+    /* Emit isClamped (1 byte) */
+    if (MO.getTargetFlags() & MO_FLAG_CLAMP) {
+      emitByte(1);
+    } else {
+      emitByte(0);
+    }
+
+    /* Emit writemask (1 byte).  */
+    if ((isReduction && reductionElement != TRI->getHWRegChan(MO.getReg()))
+         || MO.getTargetFlags() & MO_FLAG_MASK) {
+      emitByte(0);
+    } else {
+      emitByte(1);
+    }
+
+    /* XXX: Emit relative addressing mode */
+    emitByte(0);
+  } else {
+    /* XXX: Handle other operand types.  Are there any for destination regs? */
+    emitNullBytes(DST_BYTE_COUNT);
+  }
+}
+
+void R600CodeEmitter::emitALU(MachineInstr &MI, unsigned numSrc)
+{
+  /* Emit the instruction (2 bytes) */
+  emitTwoBytes(getHWInst(MI));
+
+  /* Emit isLast (for this instruction group) (1 byte) */
+  if (isLast) {
+    emitByte(1);
+  } else {
+    emitByte(0);
+  }
+  /* Emit isOp3 (1 byte) */
+  if (numSrc == 3) {
+    emitByte(1);
+  } else {
+    emitByte(0);
+  }
+
+  /* XXX: Emit predicate (1 byte) */
+  emitByte(0);
+
+  /* XXX: Emit bank swizzle. (1 byte)  Do we need this?  It looks like
+   * r600_asm.c sets it. */
+  emitByte(0);
+
+  /* XXX: Emit bank_swizzle_force (1 byte) Not sure what this is for. */
+  emitByte(0);
+
+  /* XXX: Emit OMOD (1 byte) Not implemented. */
+  emitByte(0);
+
+  /* XXX: Emit index_mode.  I think this is for indirect addressing, so we
+   * don't need to worry about it. */
+  emitByte(0);
+}
+
+void R600CodeEmitter::emitTexInstr(MachineInstr &MI)
+{
+
+  int64_t sampler = MI.getOperand(2).getImm();
+  int64_t textureType = MI.getOperand(3).getImm();
+  unsigned opcode = MI.getOpcode();
+  unsigned srcSelect[4] = {0, 1, 2, 3};
+
+  /* Emit instruction type */
+  emitByte(1);
+
+  /* Emit instruction */
+  emitByte(getHWInst(MI));
+
+  /* XXX: Emit resource id.  r600_shader.c uses sampler + 1.  Why? */
+  emitByte(sampler + 1 + 1);
+
+  /* Emit source register */
+  emitByte(getHWReg(MI.getOperand(1).getReg()));
+
+  /* XXX: Emit src isRelativeAddress */
+  emitByte(0);
+
+  /* Emit destination register */
+  emitByte(getHWReg(MI.getOperand(0).getReg()));
+
+  /* XXX: Emit dst isRelativeAddress */
+  emitByte(0);
+
+  /* XXX: Emit dst select */
+  emitByte(0); /* X */
+  emitByte(1); /* Y */
+  emitByte(2); /* Z */
+  emitByte(3); /* W */
+
+  /* XXX: Emit lod bias */
+  emitByte(0);
+
+  /* XXX: Emit coord types */
+  unsigned coordType[4] = {1, 1, 1, 1};
+
+  if (textureType == TEXTURE_RECT
+      || textureType == TEXTURE_SHADOWRECT) {
+    coordType[ELEMENT_X] = 0;
+    coordType[ELEMENT_Y] = 0;
+  }
+
+  if (textureType == TEXTURE_1D_ARRAY
+      || textureType == TEXTURE_SHADOW1D_ARRAY) {
+    if (opcode == AMDIL::TEX_SAMPLE_C_L || opcode == AMDIL::TEX_SAMPLE_C_LB) {
+      coordType[ELEMENT_Y] = 0;
+    } else {
+      coordType[ELEMENT_Z] = 0;
+      srcSelect[ELEMENT_Z] = ELEMENT_Y;
+    }
+  } else if (textureType == TEXTURE_2D_ARRAY
+             || textureType == TEXTURE_SHADOW2D_ARRAY) {
+    coordType[ELEMENT_Z] = 0;
+  }
+
+  for (unsigned i = 0; i < 4; i++) {
+    emitByte(coordType[i]);
+  }
+
+  /* XXX: Emit offsets */
+  emitByte(0); /* X */
+  emitByte(0); /* Y */
+  emitByte(0); /* Z */
+  /* There is no OFFSET_W */
+
+  /* Emit sampler id */
+  emitByte(sampler);
+
+  /* XXX:Emit source select */
+  if ((textureType == TEXTURE_SHADOW1D
+      || textureType == TEXTURE_SHADOW2D
+      || textureType == TEXTURE_SHADOWRECT
+      || textureType == TEXTURE_SHADOW1D_ARRAY)
+      && opcode != AMDIL::TEX_SAMPLE_C_L
+      && opcode != AMDIL::TEX_SAMPLE_C_LB) {
+    srcSelect[ELEMENT_W] = ELEMENT_Z;
+  }
+
+  for (unsigned i = 0; i < 4; i++) {
+    emitByte(srcSelect[i]);
+  }
+}
+
+void R600CodeEmitter::emitFCInstr(MachineInstr &MI)
+{
+  /* Emit instruction type */
+  emitByte(INSTR_FC);
+
+  /* Emit SRC */
+  unsigned numOperands = MI.getNumOperands();
+  if (numOperands > 0) {
+    assert(numOperands == 1);
+    emitSrc(MI.getOperand(0));
+  } else {
+    emitNullBytes(SRC_BYTE_COUNT);
+  }
+
+  /* Emit FC Instruction */
+  enum FCInstr instr;
+  switch (MI.getOpcode()) {
+  case AMDIL::BREAK_LOGICALZ_f32:
+    instr = FC_BREAK;
+    break;
+  case AMDIL::BREAK_LOGICALNZ_i32:
+    instr = FC_BREAK_NZ_INT;
+    break;
+  case AMDIL::BREAK_LOGICALZ_i32:
+    instr = FC_BREAK_Z_INT;
+    break;
+  case AMDIL::CONTINUE_LOGICALNZ_f32:
+    instr = FC_CONTINUE;
+    break;
+  /* XXX: This assumes that all IFs will be if (x != 0).  If we add
+   * optimizations this might not be the case */
+  case AMDIL::IF_LOGICALNZ_f32:
+  case AMDIL::IF_LOGICALNZ_i32:
+    instr = FC_IF;
+    break;
+  case AMDIL::IF_LOGICALZ_f32:
+    abort();
+    break;
+  case AMDIL::ELSE:
+    instr = FC_ELSE;
+    break;
+  case AMDIL::ENDIF:
+    instr = FC_ENDIF;
+    break;
+  case AMDIL::ENDLOOP:
+    instr = FC_ENDLOOP;
+    break;
+  case AMDIL::WHILELOOP:
+    instr = FC_BGNLOOP;
+    break;
+  default:
+    abort();
+    break;
+  }
+  emitByte(instr);
+}
+
+#define INSTR_FLOAT2_V(inst, hw) \
+  case AMDIL:: inst##_v4f32: \
+  case AMDIL:: inst##_v2f32: return HW_INST2(hw);
+
+#define INSTR_FLOAT2_S(inst, hw) \
+  case AMDIL:: inst##_f32: return HW_INST2(hw);
+
+#define INSTR_FLOAT2(inst, hw) \
+  INSTR_FLOAT2_V(inst, hw) \
+  INSTR_FLOAT2_S(inst, hw)
+
+unsigned int R600CodeEmitter::getHWInst(const MachineInstr &MI)
+{
+
+  /* XXX: Lower these to MOV before the code emitter. */
+  switch (MI.getOpcode()) {
+    case AMDIL::STORE_OUTPUT:
+    case AMDIL::VCREATE_v4i32:
+    case AMDIL::LOADCONST_i32:
+    case AMDIL::LOADCONST_f32:
+    case AMDIL::MOVE_v4i32:
+    /* Instructions to reinterpret bits as ... */
+    case AMDIL::IL_ASINT_f32:
+    case AMDIL::IL_ASINT_i32:
+    case AMDIL::IL_ASFLOAT_f32:
+    case AMDIL::IL_ASFLOAT_i32:
+      return 0x19;
+
+  default:
+    return getBinaryCodeForInstr(MI);
+  }
+}
+
+void R600CodeEmitter::emitNullBytes(unsigned int byteCount)
+{
+  for (unsigned int i = 0; i < byteCount; i++) {
+    emitByte(0);
+  }
+}
+
+void R600CodeEmitter::emitByte(unsigned int byte)
+{
+  _OS.write((uint8_t) (byte & 0xff));
+}
+void R600CodeEmitter::emitTwoBytes(uint32_t bytes)
+{
+  _OS.write((uint8_t) (bytes & 0xff));
+  _OS.write((uint8_t) ((bytes >> 8) & 0xff));
+}
+
+void R600CodeEmitter::emit(uint32_t value)
+{
+  for (unsigned i = 0; i < 4; i++) {
+    _OS.write((uint8_t) ((value >> (8 * i)) & 0xff));
+  }
+}
+
+void R600CodeEmitter::emit(uint64_t value)
+{
+  for (unsigned i = 0; i < 8; i++) {
+    emitByte((value >> (8 * i)) & 0xff);
+  }
+}
+
+unsigned R600CodeEmitter::getHWReg(unsigned regNo) const
+{
+  unsigned hwReg;
+
+  hwReg = TRI->getHWRegIndex(regNo);
+  if (AMDIL::R600_CReg32RegClass.contains(regNo)) {
+    hwReg += 512;
+  }
+  return hwReg;
+}
+
+uint64_t R600CodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                            const MachineOperand &MO) const
+{
+  if (MO.isReg()) {
+    return getHWReg(MO.getReg());
+  } else {
+    return MO.getImm();
+  }
+}
+
+
+RegElement maskBitToElement(unsigned int maskBit)
+{
+  switch (maskBit) {
+    case WRITE_MASK_X: return ELEMENT_X;
+    case WRITE_MASK_Y: return ELEMENT_Y;
+    case WRITE_MASK_Z: return ELEMENT_Z;
+    case WRITE_MASK_W: return ELEMENT_W;
+    default:
+      assert(0 && "Invalid maskBit");
+      return ELEMENT_X;
+  }
+}
+
+unsigned int dstSwizzleToWriteMask(unsigned swizzle)
+{
+  switch(swizzle) {
+  default:
+  case AMDIL_DST_SWIZZLE_DEFAULT:
+    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE_X___:
+    return WRITE_MASK_X;
+  case AMDIL_DST_SWIZZLE_XY__:
+    return WRITE_MASK_X | WRITE_MASK_Y;
+  case AMDIL_DST_SWIZZLE_XYZ_:
+    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z;
+  case AMDIL_DST_SWIZZLE_XYZW:
+    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE__Y__:
+    return WRITE_MASK_Y;
+  case AMDIL_DST_SWIZZLE__YZ_:
+    return WRITE_MASK_Y | WRITE_MASK_Z;
+  case AMDIL_DST_SWIZZLE__YZW:
+    return WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE___Z_:
+    return WRITE_MASK_Z;
+  case AMDIL_DST_SWIZZLE___ZW:
+    return WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE____W:
+    return WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE_X_ZW:
+    return WRITE_MASK_X | WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE_XY_W:
+    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE_X_Z_:
+    return WRITE_MASK_X | WRITE_MASK_Z;
+  case AMDIL_DST_SWIZZLE_X__W:
+    return WRITE_MASK_X | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE__Y_W:
+    return WRITE_MASK_Y | WRITE_MASK_W;
+  }
+}
+
+#include "AMDILGenCodeEmitter.inc"
+
diff --git a/lib/Target/AMDIL/R600KernelParameters.cpp b/lib/Target/AMDIL/R600KernelParameters.cpp
new file mode 100644
index 0000000..3fdf48a
--- /dev/null
+++ b/lib/Target/AMDIL/R600KernelParameters.cpp
@@ -0,0 +1,503 @@
+//===-- R600KernelParameters.cpp - Lower OpenCL kernel parameters --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass rewrites accesses to OpenCL kernel arguments as loads from the
+// parameter address spaces (PARAM_D_ADDRESS / PARAM_I_ADDRESS) and assigns
+// resource IDs to image and sampler arguments.
+//
+//===----------------------------------------------------------------------===//
+
+#include <llvm-c/Core.h>
+#include "R600KernelParameters.h"
+#include "R600OpenCLUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+// #include "llvm/CodeGen/Function.h"
+
+namespace AMDILAS {
+enum AddressSpaces {
+  PRIVATE_ADDRESS  = 0, // Address space for private memory.
+  GLOBAL_ADDRESS   = 1, // Address space for global memory (RAT0, VTX0).
+  CONSTANT_ADDRESS = 2, // Address space for constant memory.
+  LOCAL_ADDRESS    = 3, // Address space for local memory.
+  REGION_ADDRESS   = 4, // Address space for region memory.
+  ADDRESS_NONE     = 5, // Address space for unknown memory.
+  PARAM_D_ADDRESS  = 6, // Address space for directly addressable parameter memory (CONST0)
+  PARAM_I_ADDRESS  = 7, // Address space for indirectly addressable parameter memory (VTX1)
+  LAST_ADDRESS     = 8
+};
+}
+
+
+#include <map>
+#include <set>
+
+using namespace llvm;
+using namespace std;
+
+#define CONSTANT_CACHE_SIZE_DW 127
+
+class R600KernelParameters : public llvm::FunctionPass
+{
+  const llvm::TargetData * TD;
+  LLVMContext* Context;
+  Module *mod;
+  
+  struct param
+  {
+    param() : val(NULL), ptr_val(NULL), offset_in_dw(0), size_in_dw(0), indirect(false), specialID(0) {}
+    
+    llvm::Value* val;
+    llvm::Value* ptr_val;
+    int offset_in_dw;
+    int size_in_dw;
+
+    bool indirect;
+    
+    string specialType;
+    int specialID;
+    
+    int end() { return offset_in_dw + size_in_dw; }
+    /* The first 9 dwords are reserved for the grid sizes. */
+    int get_rat_offset() { return 9 + offset_in_dw; }
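+    /* For example, the first kernel argument (offset_in_dw == 0) is read
+     * starting at dword 9 of the parameter buffer. */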
+  };
+
+  std::vector<param> params;
+
+  int getLastSpecialID(const string& TypeName);
+  
+  int getListSize();
+  void AddParam(llvm::Argument* arg);
+  int calculateArgumentSize(llvm::Argument* arg);
+  void RunAna(llvm::Function* fun);
+  void Replace(llvm::Function* fun);
+  bool isIndirect(Value* val, set<Value*>& visited);
+  void Propagate(llvm::Function* fun);
+  void Propagate(llvm::Value* v, const llvm::Twine& name, bool indirect = false);
+  Value* ConstantRead(Function* fun, param& p);
+  Value* handleSpecial(Function* fun, param& p);
+  bool isSpecialType(Type*);
+  string getSpecialTypeName(Type*);
+public:
+  static char ID;
+  R600KernelParameters() : FunctionPass(ID) {};
+  R600KernelParameters(const llvm::TargetData* TD) : FunctionPass(ID), TD(TD) {}
+  bool runOnFunction (llvm::Function &F);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+  const char *getPassName() const;
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+};
+
+char R600KernelParameters::ID = 0;
+
+static RegisterPass<R600KernelParameters> X("kerparam", "OpenCL Kernel Parameter conversion", false, false);
+
+int R600KernelParameters::getLastSpecialID(const string& TypeName)
+{
+  int lastID = -1;
+  
+  for (vector<param>::iterator i = params.begin(); i != params.end(); i++)
+  {
+    if (i->specialType == TypeName)
+    {
+      lastID = i->specialID;
+    }
+  }
+
+  return lastID;
+}
+
+int R600KernelParameters::getListSize()
+{
+  if (params.size() == 0)
+  {
+    return 0;
+  }
+
+  return params.back().end();
+}
+
+bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
+{
+  if (isa<LoadInst>(val))
+  {
+    return false;
+  }
+
+  if (isa<IntegerType>(val->getType()))
+  {
+    assert(0 and "Internal error");
+    return false;
+  }
+
+  if (visited.count(val))
+  {
+    return false;
+  }
+
+  visited.insert(val);
+  
+  if (isa<GetElementPtrInst>(val))
+  {
+    GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(val);
+    GetElementPtrInst::op_iterator i = GEP->op_begin();
+
+    for (i++; i != GEP->op_end(); i++)
+    {
+      if (!isa<Constant>(*i))
+      {
+        return true;
+      }
+    }
+  }
+  
+  for (Value::use_iterator i = val->use_begin(); i != val->use_end(); i++)
+  {
+    Value* v2 = dyn_cast<Value>(*i);
+
+    if (v2)
+    {
+      if (isIndirect(v2, visited))
+      {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+void R600KernelParameters::AddParam(llvm::Argument* arg)
+{
+  param p;
+  
+  p.val = dyn_cast<Value>(arg);
+  p.offset_in_dw = getListSize();
+  p.size_in_dw = calculateArgumentSize(arg);
+
+  if (isa<PointerType>(arg->getType()) and arg->hasByValAttr())
+  {
+    set<Value*> visited;
+    p.indirect = isIndirect(p.val, visited);
+  }
+  
+  params.push_back(p);
+}
+
+int R600KernelParameters::calculateArgumentSize(llvm::Argument* arg)
+{
+  Type* t = arg->getType();
+
+  if (arg->hasByValAttr() and dyn_cast<PointerType>(t))
+  {
+    t = dyn_cast<PointerType>(t)->getElementType();
+  }
+  
+  int store_size_in_dw = (TD->getTypeStoreSize(t) + 3)/4;
+
+  assert(store_size_in_dw);
+  
+  return store_size_in_dw;
+}
+
+
+void R600KernelParameters::RunAna(llvm::Function* fun)
+{
+  assert(isOpenCLKernel(fun));
+
+  for (Function::arg_iterator i = fun->arg_begin(); i != fun->arg_end(); i++)
+  {
+    AddParam(i);
+  }
+
+}
+
+void R600KernelParameters::Replace(llvm::Function* fun)
+{
+  for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
+  {
+    Value *new_val;
+
+    if (isSpecialType(i->val->getType()))
+    {
+      new_val = handleSpecial(fun, *i);
+    }
+    else
+    {
+      new_val = ConstantRead(fun, *i);
+    }
+    if (new_val)
+    {
+      i->val->replaceAllUsesWith(new_val);
+    }   
+  }
+}
+
+void R600KernelParameters::Propagate(llvm::Function* fun)
+{
+  for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
+  {
+    if (i->ptr_val)
+    {
+      Propagate(i->ptr_val, i->val->getName(), i->indirect);
+   }
+  }
+}
+
+void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect)
+{
+  LoadInst* load = dyn_cast<LoadInst>(v);
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(v);
+  
+  unsigned addrspace; 
+
+  if (indirect)
+  {
+    addrspace = AMDILAS::PARAM_I_ADDRESS;
+  }
+  else
+  {
+    addrspace = AMDILAS::PARAM_D_ADDRESS;
+  }
+
+  if (GEP and GEP->getType()->getAddressSpace() != addrspace)
+  {
+    Value* op = GEP->getPointerOperand();
+
+    if (dyn_cast<PointerType>(op->getType())->getAddressSpace() != addrspace)
+    {
+      op = new BitCastInst(op, PointerType::get(dyn_cast<PointerType>(op->getType())->getElementType(), addrspace), name, dyn_cast<Instruction>(v));
+    }
+
+    vector<Value*> params(GEP->idx_begin(), GEP->idx_end());
+    
+    GetElementPtrInst* GEP2 = GetElementPtrInst::Create(op, params, name, dyn_cast<Instruction>(v));
+    GEP2->setIsInBounds(GEP->isInBounds());
+    v = dyn_cast<Value>(GEP2);
+    GEP->replaceAllUsesWith(GEP2);
+    GEP->eraseFromParent();
+    load = NULL;
+  }
+  
+  if (load)
+  {
+    if (load->getPointerAddressSpace() != addrspace) ///normally at this point we have the right address space
+    {
+      Value *orig_ptr = load->getPointerOperand();
+      PointerType *orig_ptr_type = dyn_cast<PointerType>(orig_ptr->getType());
+      
+      Type* new_ptr_type = PointerType::get(orig_ptr_type->getElementType(), addrspace);
+
+      Value* new_ptr = orig_ptr;
+      
+      if (orig_ptr->getType() != new_ptr_type)
+      {
+        new_ptr = new BitCastInst(orig_ptr, new_ptr_type, "prop_cast", load);
+      }
+      
+      Value* new_load = new LoadInst(new_ptr, name, load);
+      load->replaceAllUsesWith(new_load);
+      load->eraseFromParent();
+    }
+    
+    return;
+  }
+
+  vector<User*> users(v->use_begin(), v->use_end());
+  
+  for (int i = 0; i < int(users.size()); i++)
+  {
+    Value* v2 = dyn_cast<Value>(users[i]);
+    
+    if (v2)
+    {
+      Propagate(v2, name, indirect);
+    }
+  }
+}
+
+Value* R600KernelParameters::ConstantRead(Function* fun, param& p)
+{
+  assert(fun->front().begin() != fun->front().end());
+  
+  Instruction *first_inst = fun->front().begin();
+  IRBuilder <> builder (first_inst);
+  /* First 3 dwords are reserved for the dimension info */
+
+  if (!p.val->hasNUsesOrMore(1))
+  {
+    return NULL;
+  }
+  unsigned addrspace;
+
+  if (p.indirect)
+  {
+    addrspace = AMDILAS::PARAM_I_ADDRESS;
+  }
+  else
+  {
+    addrspace = AMDILAS::PARAM_D_ADDRESS;
+  }
+  
+  Argument *arg = dyn_cast<Argument>(p.val);
+  Type * argType = p.val->getType();
+  PointerType * argPtrType = dyn_cast<PointerType>(p.val->getType());
+  
+  if (argPtrType and arg->hasByValAttr())
+  {
+    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(Type::getInt32Ty(*Context), addrspace));
+    Value* param_ptr = GetElementPtrInst::Create(param_addr_space_ptr, ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName(), first_inst);
+    param_ptr = new BitCastInst(param_ptr, PointerType::get(argPtrType->getElementType(), addrspace), arg->getName(), first_inst);
+    p.ptr_val = param_ptr;
+    return param_ptr;
+  }
+  else
+  {
+    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(argType, addrspace));
+    
+    Value* param_ptr = builder.CreateGEP(param_addr_space_ptr,
+             ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName());
+    
+    Value* param_value = builder.CreateLoad(param_ptr, arg->getName());
+    
+    return param_value;
+  }
+}
+
+Value* R600KernelParameters::handleSpecial(Function* fun, param& p)
+{
+  string name = getSpecialTypeName(p.val->getType());
+  int ID;
+
+  assert(!name.empty());
+  
+  if (name == "image2d_t" or name == "image3d_t")
+  {
+    int lastID = max(getLastSpecialID("image2d_t"), getLastSpecialID("image3d_t"));
+    
+    if (lastID == -1)
+    {
+      ID = 2; ///ID0 and ID1 are used internally by the driver
+    }
+    else
+    {
+      ID = lastID + 1;
+    }
+  }
+  else if (name == "sampler_t")
+  {
+    int lastID = getLastSpecialID("sampler_t");
+
+    if (lastID == -1)
+    {
+      ID = 0;
+    }
+    else
+    {
+      ID = lastID + 1;
+    }    
+  }
+  else
+  {
+    ///TODO: give some error message
+    return NULL;
+  }
+    
+  p.specialType = name;
+  p.specialID = ID;
+
+  Instruction *first_inst = fun->front().begin();
+
+  return new IntToPtrInst(ConstantInt::get(Type::getInt32Ty(*Context), p.specialID), p.val->getType(), "resourceID", first_inst);
+}
+
+
+bool R600KernelParameters::isSpecialType(Type* t)
+{
+  return !getSpecialTypeName(t).empty();
+}
+
+string R600KernelParameters::getSpecialTypeName(Type* t)
+{
+  PointerType *pt = dyn_cast<PointerType>(t);
+  StructType *st = NULL;
+
+  if (pt)
+  {
+    st = dyn_cast<StructType>(pt->getElementType());
+  }
+
+  if (st)
+  {
+    string prefix = "struct.opencl_builtin_type_";
+    
+    string name = st->getName().str();
+
+    if (name.substr(0, prefix.length()) == prefix)
+    {
+      return name.substr(prefix.length(), name.length());
+    }
+  }
+
+  return "";
+}
+
+
+bool R600KernelParameters::runOnFunction (Function &F)
+{
+  if (!isOpenCLKernel(&F))
+  {
+    return false;
+  }
+
+//  F.dump();
+  
+  RunAna(&F);
+  Replace(&F);
+  Propagate(&F);
+  
+  return false;
+}
+
+void R600KernelParameters::getAnalysisUsage(AnalysisUsage &AU) const
+{
+//   AU.addRequired<FunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
+
+const char *R600KernelParameters::getPassName() const
+{
+  return "OpenCL Kernel parameter conversion to memory";
+}
+
+bool R600KernelParameters::doInitialization(Module &M)
+{
+  Context = &M.getContext();
+  mod = &M;
+  
+  return false;
+}
+
+bool R600KernelParameters::doFinalization(Module &M)
+{
+  return false;
+}
+
+llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD)
+{
+  FunctionPass *p = new R600KernelParameters(TD);
+  
+  return p;
+}
+
+
diff --git a/lib/Target/AMDIL/R600KernelParameters.h b/lib/Target/AMDIL/R600KernelParameters.h
new file mode 100644
index 0000000..904a469
--- /dev/null
+++ b/lib/Target/AMDIL/R600KernelParameters.h
@@ -0,0 +1,28 @@
+//===-- R600KernelParameters.h - OpenCL kernel parameter lowering --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the interface to the R600 OpenCL kernel parameter lowering pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KERNELPARAMETERS_H
+#define KERNELPARAMETERS_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Value.h"
+
+#include <vector>
+
+llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD);
+
+
+#endif
diff --git a/lib/Target/AMDIL/R600LowerInstructions.cpp b/lib/Target/AMDIL/R600LowerInstructions.cpp
new file mode 100644
index 0000000..fb5431d
--- /dev/null
+++ b/lib/Target/AMDIL/R600LowerInstructions.cpp
@@ -0,0 +1,502 @@
+//===-- R600LowerInstructions.cpp - Lower unsupported AMDIL instructions -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers AMDIL machine instructions that have no direct R600
+// equivalent into sequences of native R600 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUUtil.h"
+#include "AMDIL.h"
+#include "AMDILRegisterInfo.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include <stdio.h>
+
+using namespace llvm;
+
+namespace {
+  class R600LowerInstructionsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+    const R600InstrInfo * TII;
+    MachineRegisterInfo * MRI;
+
+    void lowerFLT(MachineInstr &MI);
+
+    void calcAddress(const MachineOperand &ptrOp,
+                     const MachineOperand &indexOp,
+                     unsigned indexReg,
+                     MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator I) const;
+
+    void divMod(MachineInstr &MI,
+                  MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  bool div = true) const;
+
+  public:
+    R600LowerInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm),
+      TII(static_cast<const R600InstrInfo*>(tm.getInstrInfo())),
+      MRI(NULL)
+      { }
+
+    const char *getPassName() const { return "R600 Lower Instructions"; }
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  };
+} /* End anonymous namespace */
+
+char R600LowerInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createR600LowerInstructionsPass(TargetMachine &tm) {
+  return new R600LowerInstructionsPass(tm);
+}
+
+bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  MRI = &MF.getRegInfo();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
+
+      MachineInstr &MI = *I;
+      switch(MI.getOpcode()) {
+      case AMDIL::FLT:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FGE))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(2))
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::ABS_i32:
+        {
+          unsigned setgt = MRI->createVirtualRegister(
+                           &AMDIL::R600_TReg32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT),
+                  setgt)
+                  .addOperand(MI.getOperand(1))
+                  .addReg(AMDIL::ZERO);
+
+          unsigned add_int = MRI->createVirtualRegister(
+                             &AMDIL::R600_TReg32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT),
+                  add_int)
+                  .addReg(setgt)
+                  .addOperand(MI.getOperand(1));
+
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::XOR_INT))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(setgt)
+                  .addReg(add_int);
+
+          break;
+        }
+
+      /* XXX: We could propagate the ABS flag to all of the uses of Operand0 and
+       * remove the ABS instruction.*/
+      case AMDIL::FABS_f32:
+      case AMDIL::ABS_f32:
+        MI.getOperand(1).addTargetFlag(MO_FLAG_ABS);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::MOVE_f32))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::BINARY_OR_f32:
+        {
+        unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FTOI), tmp0)
+                .addOperand(MI.getOperand(1));
+        unsigned tmp1 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FTOI), tmp1)
+                .addOperand(MI.getOperand(2));
+        unsigned tmp2 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::BINARY_OR_i32), tmp2)
+                .addReg(tmp0)
+                .addReg(tmp1);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::ITOF), MI.getOperand(0).getReg())
+                .addReg(tmp2);
+        break;
+        }
+      case AMDIL::CMOVLOG_f32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(MI.getOpcode()))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1))
+                .addOperand(MI.getOperand(3))
+                .addOperand(MI.getOperand(2));
+        break;
+
+      case AMDIL::CMOVLOG_i32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1))
+                .addOperand(MI.getOperand(3))
+                .addOperand(MI.getOperand(2));
+        break;
+
+      case AMDIL::CLAMP_f32:
+        {
+          MachineOperand lowOp = MI.getOperand(2);
+          MachineOperand highOp = MI.getOperand(3);
+        if (lowOp.isReg() && highOp.isReg()
+            && lowOp.getReg() == AMDIL::ZERO && highOp.getReg() == AMDIL::ONE) {
+          MI.getOperand(0).addTargetFlag(MO_FLAG_CLAMP);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV))
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+        } else {
+          /* XXX: Handle other cases */
+          abort();
+        }
+        break;
+        }
+
+      case AMDIL::UDIV_i32:
+        divMod(MI, MBB, I);
+        break;
+
+      /* XXX: Figure out the semantics of DIV_INF_f32 and make sure this is OK */
+/*      case AMDIL::DIV_INF_f32:
+        {
+          unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::GPRF32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TM.getInstrInfo()->get(AMDIL::RECIP_CLAMPED), tmp0)
+                  .addOperand(MI.getOperand(2));
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TM.getInstrInfo()->get(AMDIL::MUL_IEEE_f32))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(tmp0)
+                  .addOperand(MI.getOperand(1));
+          break;
+        }
+*/        /* XXX: This is an optimization */
+
+      case AMDIL::GLOBALLOAD_f32:
+      case AMDIL::GLOBALLOAD_i32:
+        {
+          MachineOperand &ptrOperand = MI.getOperand(1);
+          MachineOperand &indexOperand = MI.getOperand(2);
+          unsigned indexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+
+          /* Calculate the address within the VTX buffer */
+          calcAddress(ptrOperand, indexOperand, indexReg, MBB, I);
+
+          /* Make sure the VTX_READ_eg writes to the X chan */
+          MRI->setRegClass(MI.getOperand(0).getReg(),
+                          &AMDIL::R600_TReg32_XRegClass);
+
+          /* Add the VTX_READ_eg instruction */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TII->get(AMDIL::VTX_READ_eg))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(indexReg)
+                  .addImm(1);
+          break;
+        }
+
+      case AMDIL::GLOBALSTORE_i32:
+      case AMDIL::GLOBALSTORE_f32:
+        {
+          MachineOperand &ptrOperand = MI.getOperand(1);
+          MachineOperand &indexOperand = MI.getOperand(2);
+          unsigned rwReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+          unsigned byteIndexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+          unsigned shiftReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+          unsigned indexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+
+          /* Move the store value to the correct register class */
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::COPY), rwReg)
+                  .addOperand(MI.getOperand(0));
+
+          /* Calculate the address in the RAT */
+          calcAddress(ptrOperand, indexOperand, byteIndexReg, MBB, I);
+
+
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV), shiftReg)
+                  .addReg(AMDIL::ALU_LITERAL_X)
+                  .addImm(2);
+
+          /* XXX: Check GPU family */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TII->get(AMDIL::LSHR_eg), indexReg)
+                 .addReg(byteIndexReg)
+                 .addReg(shiftReg);
+
+          /* XXX: Check GPU Family */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TII->get(AMDIL::RAT_WRITE_CACHELESS_eg))
+                  .addReg(rwReg)
+                  .addReg(indexReg)
+                  .addImm(0);
+          break;
+        }
+      case AMDIL::ILT:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGT_INT))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(2))
+                .addOperand(MI.getOperand(1));
+        break;
+      case AMDIL::LOADCONST_f32:
+      case AMDIL::LOADCONST_i32:
+        {
+          bool canInline = false;
+          unsigned inlineReg;
+          MachineOperand & dstOp = MI.getOperand(0);
+          MachineOperand & immOp = MI.getOperand(1);
+          if (immOp.isFPImm()) {
+            const ConstantFP * cfp = immOp.getFPImm();
+            if (cfp->isZero()) {
+              canInline = true;
+              inlineReg = AMDIL::ZERO;
+            } else if (cfp->isExactlyValue(1.0f)) {
+              canInline = true;
+              inlineReg = AMDIL::ONE;
+            } else if (cfp->isExactlyValue(0.5f)) {
+              canInline = true;
+              inlineReg = AMDIL::HALF;
+            }
+          }
+
+          if (canInline) {
+            MachineOperand * use = dstOp.getNextOperandForReg();
+            /* The lowering operation for CLAMP needs to have the immediates
+             * as operands, so we must propagate them. */
+            while (use) {
+              MachineOperand * next = use->getNextOperandForReg();
+              if (use->getParent()->getOpcode() == AMDIL::CLAMP_f32) {
+                use->setReg(inlineReg);
+              }
+              use = next;
+            }
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::COPY))
+                    .addOperand(dstOp)
+                    .addReg(inlineReg);
+          } else {
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV))
+                    .addOperand(dstOp)
+                    .addReg(AMDIL::ALU_LITERAL_X)
+                    .addOperand(immOp);
+          }
+          break;
+        }
+
+      case AMDIL::MASK_WRITE:
+      {
+        unsigned maskedRegister = MI.getOperand(0).getReg();
+        assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+        MachineInstr * defInstr = MRI->getVRegDef(maskedRegister);
+        MachineOperand * def = defInstr->findRegisterDefOperand(maskedRegister);
+        def->addTargetFlag(MO_FLAG_MASK);
+        break;
+      }
+
+      case AMDIL::NEGATE_i32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT))
+                .addOperand(MI.getOperand(0))
+                .addReg(AMDIL::ZERO)
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::NEG_f32:
+        {
+            MI.getOperand(1).addTargetFlag(MO_FLAG_NEG);
+            BuildMI(MBB, I, MBB.findDebugLoc(I),
+                    TII->get(TII->getISAOpcode(AMDIL::MOV)))
+            .addOperand(MI.getOperand(0))
+            .addOperand(MI.getOperand(1));
+          break;
+        }
+
+      case AMDIL::SUB_f32:
+        {
+          MI.getOperand(2).addTargetFlag(MO_FLAG_NEG);
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TII->get(TII->getISAOpcode(AMDIL::ADD_f32)))
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1))
+                  .addOperand(MI.getOperand(2));
+          break;
+        }
+
+      default:
+        continue;
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
+
+void R600LowerInstructionsPass::calcAddress(const MachineOperand &ptrOp,
+                                            const MachineOperand &indexOp,
+                                            unsigned indexReg,
+                                            MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator I) const
+{
+  /* Optimize the case where the indexOperand is 0 */
+  if (indexOp.isImm() && indexOp.getImm() == 0) {
+    assert(ptrOp.isReg());
+    BuildMI(MBB, I, MBB.findDebugLoc(I),
+                    TII->get(AMDIL::COPY), indexReg)
+            .addOperand(ptrOp);
+  } else {
+    BuildMI(MBB, I, MBB.findDebugLoc(I),
+                    TII->get(AMDIL::ADD_INT), indexReg)
+            .addOperand(indexOp)
+            .addOperand(ptrOp);
+  }
+}
+
+/* Mostly copied from tgsi_divmod() in r600_shader.c */
+void R600LowerInstructionsPass::divMod(MachineInstr &MI,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       bool div) const
+{
+  unsigned dst = MI.getOperand(0).getReg();
+  MachineOperand &numerator = MI.getOperand(1);
+  MachineOperand &denominator = MI.getOperand(2);
+  /* rcp = RECIP(denominator) = 2^32 / denominator + e
+   * e is rounding error */
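+  /* Sketch of the algorithm (it mirrors tgsi_divmod, see the note above this
+   * function): rcp is only approximate because of truncation.  The MULLO_UINT
+   * and MULHI_UINT steps below measure how far rcp * denominator is from 2^32
+   * and form rcp + e or rcp - e accordingly, so that
+   * q = MULHI(corrected rcp, numerator) is within one of the true quotient;
+   * the final compare-and-select instructions apply that last correction. */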
+  unsigned rcp = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getRECIP_UINT()), rcp)
+          .addOperand(denominator);
+
+  /* rcp_lo = lo(rcp * denominator) */
+  unsigned rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULLO_UINT()), rcp_lo)
+          .addReg(rcp)
+          .addOperand(denominator);
+
+  /* rcp_hi = HI (rcp * denominator) */
+  unsigned rcp_hi = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), rcp_hi)
+          .addReg(rcp)
+          .addOperand(denominator);
+
+  unsigned neg_rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), neg_rcp_lo)
+          .addReg(AMDIL::ZERO)
+          .addReg(rcp_lo);
+
+  unsigned abs_rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), abs_rcp_lo)
+          .addReg(rcp_hi)
+          .addReg(neg_rcp_lo)
+          .addReg(rcp_lo);
+
+  unsigned e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), e)
+          .addReg(abs_rcp_lo)
+          .addReg(rcp);
+
+  unsigned rcp_plus_e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), rcp_plus_e)
+          .addReg(rcp)
+          .addReg(e);
+
+  unsigned rcp_sub_e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), rcp_sub_e)
+          .addReg(rcp)
+          .addReg(e);
+
+  /* tmp0 = rcp_hi == 0 ? rcp_plus_e : rcp_sub_e */
+  unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), tmp0)
+          .addReg(rcp_hi)
+          .addReg(rcp_plus_e)
+          .addReg(rcp_sub_e);
+
+  unsigned q = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), q)
+          .addReg(tmp0)
+          .addOperand(numerator);
+
+  /* num_sub_r = q * denominator */
+  unsigned num_sub_r = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULLO_UINT()),
+          num_sub_r)
+          .addReg(q)
+          .addOperand(denominator);
+
+  unsigned r = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), r)
+          .addOperand(numerator)
+          .addReg(num_sub_r);
+
+  unsigned r_ge_den = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT), r_ge_den)
+          .addReg(r)
+          .addOperand(denominator);
+
+  unsigned r_ge_zero = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT), r_ge_zero)
+          .addOperand(numerator)
+          .addReg(num_sub_r);
+
+  unsigned tmp1 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::AND_INT), tmp1)
+          .addReg(r_ge_den)
+          .addReg(r_ge_zero);
+
+  unsigned val0 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  unsigned val1 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  unsigned result = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
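+
+  /* Correct the initial guess: if the remainder underflowed
+   * (numerator < q * denominator) the quotient guess was one too high; if
+   * the remainder is still >= denominator it was one too low.  Adjust the
+   * quotient by +/- 1, or the remainder by -/+ denominator, accordingly. */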
+  if (div) {
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), val0)
+            .addReg(q)
+            .addReg(AMDIL::ONE_INT);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), val1)
+            .addReg(q)
+            .addReg(AMDIL::ONE_INT);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), result)
+            .addReg(tmp1)
+            .addReg(q)
+            .addReg(val0);
+  } else {
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), val0)
+            .addReg(r)
+            .addOperand(denominator);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), val1)
+            .addReg(r)
+            .addOperand(denominator);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), result)
+            .addReg(tmp1)
+            .addReg(r)
+            .addReg(val0);
+  }
+
+  /* XXX: Do we need to set to MAX_INT if denominator is 0? */
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), dst)
+          .addReg(r_ge_zero)
+          .addReg(val1)
+          .addReg(result);
+}
diff --git a/lib/Target/AMDIL/R600LowerShaderInstructions.cpp b/lib/Target/AMDIL/R600LowerShaderInstructions.cpp
new file mode 100644
index 0000000..394ee70
--- /dev/null
+++ b/lib/Target/AMDIL/R600LowerShaderInstructions.cpp
@@ -0,0 +1,143 @@
+//===-- R600LowerShaderInstructions.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPULowerShaderInstructions.h"
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class R600LowerShaderInstructionsPass : public MachineFunctionPass,
+        public AMDGPULowerShaderInstructionsPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+    void lowerEXPORT_REG_FAKE(MachineInstr &MI, MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator I);
+    void lowerLOAD_INPUT(MachineInstr & MI);
+    bool lowerSTORE_OUTPUT(MachineInstr & MI, MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator I);
+
+  public:
+    R600LowerShaderInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+      bool runOnMachineFunction(MachineFunction &MF);
+
+      const char *getPassName() const { return "R600 Lower Shader Instructions"; }
+    };
+} /* End anonymous namespace */
+
+char R600LowerShaderInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createR600LowerShaderInstructionsPass(TargetMachine &tm) {
+    return new R600LowerShaderInstructionsPass(tm);
+}
+
+#define INSTR_CASE_FLOAT_V(inst) \
+  case AMDIL:: inst##_v4f32: \
+
+#define INSTR_CASE_FLOAT_S(inst) \
+  case AMDIL:: inst##_f32:
+
+#define INSTR_CASE_FLOAT(inst) \
+  INSTR_CASE_FLOAT_V(inst) \
+  INSTR_CASE_FLOAT_S(inst)
+bool R600LowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  MRI = &MF.getRegInfo();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+      MachineInstr &MI = *I;
+      bool deleteInstr = false;
+      switch (MI.getOpcode()) {
+
+      default: break;
+
+      case AMDIL::RESERVE_REG:
+      case AMDIL::EXPORT_REG:
+        deleteInstr = true;
+        break;
+
+      case AMDIL::LOAD_INPUT:
+        lowerLOAD_INPUT(MI);
+        deleteInstr = true;
+        break;
+
+      case AMDIL::STORE_OUTPUT:
+        deleteInstr = lowerSTORE_OUTPUT(MI, MBB, I);
+        break;
+
+      }
+
+      ++I;
+
+      if (deleteInstr) {
+        MI.eraseFromParent();
+      }
+    }
+  }
+
+  return false;
+}
+
+/* The goal of this function is to replace the virtual destination register of
+ * a LOAD_INPUT instruction with the physical register that the input value
+ * will be preloaded into.
+ *
+ * XXX: I don't think this is the right way to assign physical registers, but
+ * I'm not sure of another way to do this.
+ */
+void R600LowerShaderInstructionsPass::lowerLOAD_INPUT(MachineInstr &MI)
+{
+  MachineOperand &dst = MI.getOperand(0);
+  MachineOperand &arg = MI.getOperand(1);
+  int64_t inputIndex = arg.getImm();
+  const TargetRegisterClass * inputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
+  unsigned newRegister = inputClass->getRegister(inputIndex);
+  unsigned dstReg = dst.getReg();
+
+  preloadRegister(MI.getParent()->getParent(), TM.getInstrInfo(), newRegister,
+                  dstReg);
+}
+
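+/* Copy the stored value into the physical output register selected by the
+ * index operand and mark that register as live-out of the function. */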
+bool R600LowerShaderInstructionsPass::lowerSTORE_OUTPUT(MachineInstr &MI,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
+{
+  MachineOperand &valueOp = MI.getOperand(1);
+  MachineOperand &indexOp = MI.getOperand(2);
+  unsigned valueReg = valueOp.getReg();
+  int64_t outputIndex = indexOp.getImm();
+  const TargetRegisterClass * outputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
+  unsigned newRegister = outputClass->getRegister(outputIndex);
+
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::COPY),
+                  newRegister)
+                  .addReg(valueReg);
+
+  if (!MRI->isLiveOut(newRegister))
+    MRI->addLiveOut(newRegister);
+
+  return true;
+}
diff --git a/lib/Target/AMDIL/R600OpenCLUtils.h b/lib/Target/AMDIL/R600OpenCLUtils.h
new file mode 100644
index 0000000..91e41d6
--- /dev/null
+++ b/lib/Target/AMDIL/R600OpenCLUtils.h
@@ -0,0 +1,49 @@
+//===-- OpenCLUtils.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+#ifndef OPENCLUTILS_H
+#define OPENCLUTILS_H
+
+#include "llvm/Function.h"
+
+#include <llvm/Module.h>
+
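+/* Returns true if 'fun' is named as a kernel in the module's
+ * "opencl.kernels" named metadata. */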
+static bool isOpenCLKernel(const llvm::Function* fun)
+{
+  llvm::Module *mod = const_cast<llvm::Function*>(fun)->getParent();
+  llvm::NamedMDNode * md = mod->getOrInsertNamedMetadata("opencl.kernels");
+
+  if (!md || !md->getNumOperands())
+  {
+    return false;
+  }
+
+  for (int i = 0; i < int(md->getNumOperands()); i++)
+  {
+    if (!md->getOperand(i) || !md->getOperand(i)->getOperand(0))
+    {
+      continue;
+    }
+
+    assert(md->getOperand(i)->getNumOperands() == 1);
+
+    if (md->getOperand(i)->getOperand(0)->getName() == fun->getName())
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+#endif
diff --git a/lib/Target/AMDIL/SIAssignInterpRegs.cpp b/lib/Target/AMDIL/SIAssignInterpRegs.cpp
new file mode 100644
index 0000000..6fe29c6
--- /dev/null
+++ b/lib/Target/AMDIL/SIAssignInterpRegs.cpp
@@ -0,0 +1,110 @@
+//===-- SIAssignInterpRegs.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDIL.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class SIAssignInterpRegsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    SIAssignInterpRegsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "SI Assign Interpolation Registers"; }
+  };
+} // End anonymous namespace
+
+char SIAssignInterpRegsPass::ID = 0;
+
+#define INTERP_VALUES 16
+
+struct interp_info {
+  bool enabled;
+  unsigned regs[3];
+  unsigned reg_count;
+};
+
+
+FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
+  return new SIAssignInterpRegsPass(tm);
+}
+
+bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF)
+{
+
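+  /* One entry per interpolation value, in spi_ps_input_addr bit order:
+   * 'regs' holds the pseudo-registers for that value's components,
+   * 'reg_count' the number of registers used, and 'enabled' is filled in by
+   * the first pass below. */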
+  struct interp_info InterpUse[INTERP_VALUES] = {
+    {false, {AMDIL::PERSP_SAMPLE_I, AMDIL::PERSP_SAMPLE_J}, 2},
+    {false, {AMDIL::PERSP_CENTER_I, AMDIL::PERSP_CENTER_J}, 2},
+    {false, {AMDIL::PERSP_CENTROID_I, AMDIL::PERSP_CENTROID_J}, 2},
+    {false, {AMDIL::PERSP_I_W, AMDIL::PERSP_J_W, AMDIL::PERSP_1_W}, 3},
+    {false, {AMDIL::LINEAR_SAMPLE_I, AMDIL::LINEAR_SAMPLE_J}, 2},
+    {false, {AMDIL::LINEAR_CENTER_I, AMDIL::LINEAR_CENTER_J}, 2},
+    {false, {AMDIL::LINEAR_CENTROID_I, AMDIL::LINEAR_CENTROID_J}, 2},
+    {false, {AMDIL::LINE_STIPPLE_TEX_COORD}, 1},
+    {false, {AMDIL::POS_X_FLOAT}, 1},
+    {false, {AMDIL::POS_Y_FLOAT}, 1},
+    {false, {AMDIL::POS_Z_FLOAT}, 1},
+    {false, {AMDIL::POS_W_FLOAT}, 1},
+    {false, {AMDIL::FRONT_FACE}, 1},
+    {false, {AMDIL::ANCILLARY}, 1},
+    {false, {AMDIL::SAMPLE_COVERAGE}, 1},
+    {false, {AMDIL::POS_FIXED_PT}, 1}
+  };
+
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  /* First pass, mark the interpolation values that are used. */
+  for (unsigned interp_idx = 0; interp_idx < INTERP_VALUES; interp_idx++) {
+    for (unsigned reg_idx = 0; reg_idx < InterpUse[interp_idx].reg_count;
+                                                               reg_idx++) {
+      if (!MRI.use_empty(InterpUse[interp_idx].regs[reg_idx])) {
+        InterpUse[interp_idx].enabled = true;
+      }
+    }
+  }
+
+  unsigned used_vgprs = 0;
+
+  /* Second pass, replace with VGPRs. */
+  for (unsigned interp_idx = 0; interp_idx < INTERP_VALUES; interp_idx++) {
+    if (!InterpUse[interp_idx].enabled) {
+      continue;
+    }
+    MFI->spi_ps_input_addr |= (1 << interp_idx);
+
+    for (unsigned reg_idx = 0; reg_idx < InterpUse[interp_idx].reg_count;
+                                                  reg_idx++, used_vgprs++) {
+      unsigned new_reg = AMDIL::VReg_32RegClass.getRegister(used_vgprs);
+      unsigned virt_reg = MRI.createVirtualRegister(&AMDIL::VReg_32RegClass);
+      MRI.replaceRegWith(InterpUse[interp_idx].regs[reg_idx], virt_reg);
+      AMDGPU::utilAddLiveIn(&MF, MRI, TM.getInstrInfo(), new_reg, virt_reg);
+    }
+  }
+
+  return false;
+}
diff --git a/lib/Target/AMDIL/SICodeEmitter.cpp b/lib/Target/AMDIL/SICodeEmitter.cpp
new file mode 100644
index 0000000..ad494fa
--- /dev/null
+++ b/lib/Target/AMDIL/SICodeEmitter.cpp
@@ -0,0 +1,274 @@
+//===-- SICodeEmitter.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDILCodeEmitter.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <stdio.h>
+
+#define LITERAL_REG 255
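+/* Bit that marks source operand 'src_idx' (1-based) as a VGPR in the
+ * instruction encoding; set by VOPPostEncode() for VOP instructions. */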
+#define VGPR_BIT(src_idx) (1ULL << (9 * (src_idx) - 1))
+using namespace llvm;
+
+namespace {
+
+  class SICodeEmitter : public MachineFunctionPass, public AMDILCodeEmitter {
+
+  private:
+    static char ID;
+    formatted_raw_ostream &_OS;
+    const TargetMachine *TM;
+    void emitState(MachineFunction & MF);
+    void emitInstr(MachineInstr &MI);
+
+    void outputBytes(uint64_t value, unsigned bytes);
+    unsigned GPRAlign(const MachineInstr &MI, unsigned OpNo, unsigned shift)
+                                                                      const;
+
+  public:
+    SICodeEmitter(formatted_raw_ostream &OS) : MachineFunctionPass(ID),
+        _OS(OS), TM(NULL) { }
+    const char *getPassName() const { return "SI Code Emitter"; }
+    bool runOnMachineFunction(MachineFunction &MF);
+    virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+                                       const MachineOperand &MO) const;
+    virtual unsigned GPR4AlignEncode(const MachineInstr  &MI, unsigned OpNo)
+                                                                      const;
+    virtual unsigned GPR2AlignEncode(const MachineInstr &MI, unsigned OpNo)
+                                                                      const;
+    virtual uint64_t i32LiteralEncode(const MachineInstr &MI, unsigned OpNo)
+                                                                      const;
+    virtual uint64_t VOPPostEncode(const MachineInstr &MI,
+                                   uint64_t Value) const;
+  };
+}
+
+char SICodeEmitter::ID = 0;
+
+FunctionPass *llvm::createSICodeEmitterPass(formatted_raw_ostream &OS) {
+  return new SICodeEmitter(OS);
+}
+
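+/* Emit the program state header: the number of SGPRs and VGPRs used by the
+ * function (highest register index + 1, plus two more SGPRs if VCC is used),
+ * followed by the spi_ps_input_addr mask from SIMachineFunctionInfo. */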
+void SICodeEmitter::emitState(MachineFunction & MF)
+{
+  unsigned maxSGPR = 0;
+  unsigned maxVGPR = 0;
+  bool VCCUsed = false;
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      unsigned numOperands = MI.getNumOperands();
+      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+        MachineOperand & MO = MI.getOperand(op_idx);
+        unsigned maxUsed;
+        unsigned width = 0;
+        bool isSGPR = false;
+        unsigned reg;
+        unsigned hwReg;
+        if (!MO.isReg()) {
+          continue;
+        }
+        reg = MO.getReg();
+        if (reg == AMDIL::VCC) {
+          VCCUsed = true;
+          continue;
+        }
+        if (AMDIL::SReg_32RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 1;
+        } else if (AMDIL::VReg_32RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 1;
+        } else if (AMDIL::SReg_64RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 2;
+        } else if (AMDIL::VReg_64RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 2;
+        } else if (AMDIL::SReg_128RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 4;
+        } else if (AMDIL::VReg_128RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 4;
+        } else if (AMDIL::SReg_256RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 8;
+        } else {
+          assert(!"Unknown register class");
+        }
+        hwReg = RI->getHWRegNum(reg);
+        maxUsed = ((hwReg + 1) * width) - 1;
+        if (isSGPR) {
+          maxSGPR = maxUsed > maxSGPR ? maxUsed : maxSGPR;
+        } else {
+          maxVGPR = maxUsed > maxVGPR ? maxUsed : maxVGPR;
+        }
+      }
+    }
+  }
+  if (VCCUsed) {
+    maxSGPR += 2;
+  }
+  outputBytes(maxSGPR + 1, 4);
+  outputBytes(maxVGPR + 1, 4);
+  outputBytes(MFI->spi_ps_input_addr, 4);
+}
+
+bool SICodeEmitter::runOnMachineFunction(MachineFunction &MF)
+{
+  MF.dump();
+  TM = &MF.getTarget();
+  const AMDGPUInstrInfo * TII =
+                        static_cast<const AMDGPUInstrInfo*>(TM->getInstrInfo());
+
+  emitState(MF);
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      if (!TII->isRegPreload(MI) && MI.getOpcode() != AMDIL::KILL
+          && MI.getOpcode() != AMDIL::RETURN) {
+        emitInstr(MI);
+      }
+    }
+  }
+  return false;
+}
+
+void SICodeEmitter::emitInstr(MachineInstr &MI)
+{
+  const SIInstrInfo * SII = static_cast<const SIInstrInfo*>(TM->getInstrInfo());
+
+  uint64_t hwInst = getBinaryCodeForInstr(MI);
+
+  if ((hwInst & 0xffffffff) == 0xffffffff) {
+    fprintf(stderr, "Unsupported Instruction: \n");
+    MI.dump();
+    abort();
+  }
+
+//  hwInst |= SII->getBinaryCode(MI);
+
+  unsigned bytes = SII->getEncodingBytes(MI);
+  outputBytes(hwInst, bytes);
+}
+
+uint64_t SICodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                          const MachineOperand &MO) const
+{
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+
+  switch(MO.getType()) {
+  case MachineOperand::MO_Register:
+    return RI->getBinaryCode(MO.getReg());
+
+  case MachineOperand::MO_Immediate:
+    return MO.getImm();
+
+  case MachineOperand::MO_FPImmediate:
+    /* XXX: Not all instructions can use inline literals */
+    /* XXX: We should make sure this is a 32-bit constant */
+    return LITERAL_REG | (MO.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue() << 32);
+  default:
+    llvm_unreachable("Encoding of this operand type is not supported yet.");
+    break;
+  }
+}
+
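+/* Encode a GPR operand by right-shifting its hardware register number by
+ * 'shift' bits; used for operands that must be 2- or 4-register aligned. */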
+unsigned SICodeEmitter::GPRAlign(const MachineInstr &MI, unsigned OpNo,
+    unsigned shift) const
+{
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+  unsigned regCode = RI->getHWRegNum(MI.getOperand(OpNo).getReg());
+  return regCode >> shift;
+}
+
+unsigned SICodeEmitter::GPR4AlignEncode(const MachineInstr &MI,
+    unsigned OpNo) const
+{
+  return GPRAlign(MI, OpNo, 2);
+}
+
+unsigned SICodeEmitter::GPR2AlignEncode(const MachineInstr &MI,
+    unsigned OpNo) const
+{
+  return GPRAlign(MI, OpNo, 1);
+}
+
+uint64_t SICodeEmitter::i32LiteralEncode(const MachineInstr &MI,
+    unsigned OpNo) const
+{
+  return LITERAL_REG | (MI.getOperand(OpNo).getImm() << 32);
+}
+
+/* Set the "VGPR" bit for VOP args that can take either a VGPR or an SGPR.
+ * XXX: It would be nice if we could handle this without a PostEncode function.
+ */
+uint64_t SICodeEmitter::VOPPostEncode(const MachineInstr &MI,
+    uint64_t Value) const
+{
+  const SIInstrInfo * SII = static_cast<const SIInstrInfo*>(TM->getInstrInfo());
+  unsigned encodingType = SII->getEncodingType(MI);
+  unsigned numSrcOps;
+  unsigned vgprBitOffset;
+
+  if (encodingType == SIInstrEncodingType::VOP3) {
+    numSrcOps = 3;
+    vgprBitOffset = 32;
+  } else {
+    numSrcOps = 1;
+    vgprBitOffset = 0;
+  }
+
+  /* Add one to skip over the destination reg operand. */
+  for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) {
+    if (!MI.getOperand(opIdx).isReg()) {
+      continue;
+    }
+    unsigned reg = MI.getOperand(opIdx).getReg();
+    if (AMDIL::VReg_32RegClass.contains(reg)
+        || AMDIL::VReg_64RegClass.contains(reg)) {
+      Value |= (VGPR_BIT(opIdx)) << vgprBitOffset;
+    }
+  }
+  return Value;
+}
+
+
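+/* Write the low 'bytes' bytes of 'value' to the output stream, least
+ * significant byte first. */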
+void SICodeEmitter::outputBytes(uint64_t value, unsigned bytes)
+{
+  for (unsigned i = 0; i < bytes; i++) {
+    _OS.write((uint8_t) ((value >> (8 * i)) & 0xff));
+  }
+}
diff --git a/lib/Target/AMDIL/SILowerShaderInstructions.cpp b/lib/Target/AMDIL/SILowerShaderInstructions.cpp
new file mode 100644
index 0000000..5d49d88
--- /dev/null
+++ b/lib/Target/AMDIL/SILowerShaderInstructions.cpp
@@ -0,0 +1,90 @@
+//===-- SILowerShaderInstructions.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPULowerShaderInstructions.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class SILowerShaderInstructionsPass : public MachineFunctionPass,
+      public AMDGPULowerShaderInstructionsPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    SILowerShaderInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "SI Lower Shader Instructions"; }
+
+    void lowerRETURN(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+    void lowerSET_M0(MachineInstr &MI, MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator I);
+  };
+} /* End anonymous namespace */
+
+char SILowerShaderInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createSILowerShaderInstructionsPass(TargetMachine &tm) {
+    return new SILowerShaderInstructionsPass(tm);
+}
+
+bool SILowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  MRI = &MF.getRegInfo();
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = I;
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+      switch (MI.getOpcode()) {
+      case AMDIL::RETURN:
+        lowerRETURN(MBB, I);
+        break;
+      case AMDIL::SET_M0:
+        lowerSET_M0(MI, MBB, I);
+        break;
+      default: continue;
+      }
+      MI.removeFromParent();
+    }
+  }
+
+  return false;
+}
+
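+/* Lower the RETURN pseudo-instruction to the hardware S_ENDPGM instruction. */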
+void SILowerShaderInstructionsPass::lowerRETURN(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I)
+{
+  const struct TargetInstrInfo * TII = TM.getInstrInfo();
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_ENDPGM));
+}
+
+void SILowerShaderInstructionsPass::lowerSET_M0(MachineInstr &MI,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
+{
+  const struct TargetInstrInfo * TII = TM.getInstrInfo();
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_MOV_IMM_I32))
+          .addReg(AMDIL::M0)
+          .addOperand(MI.getOperand(1));
+}
diff --git a/lib/Target/AMDIL/SIPropagateImmReads.cpp b/lib/Target/AMDIL/SIPropagateImmReads.cpp
new file mode 100644
index 0000000..4f925d5
--- /dev/null
+++ b/lib/Target/AMDIL/SIPropagateImmReads.cpp
@@ -0,0 +1,70 @@
+//===-- SIPropagateImmReads.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+  class SIPropagateImmReadsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    SIPropagateImmReadsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+  };
+} /* End anonymous namespace */
+
+char SIPropagateImmReadsPass::ID = 0;
+
+FunctionPass *llvm::createSIPropagateImmReadsPass(TargetMachine &tm) {
+  return new SIPropagateImmReadsPass(tm);
+}
+
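+/* Replace LOADCONST_* pseudo-instructions with V_MOV_IMM, preserving the
+ * destination and immediate operands. */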
+bool SIPropagateImmReadsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  const SIInstrInfo * TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = I;
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      case AMDIL::LOADCONST_f32:
+      case AMDIL::LOADCONST_i32:
+        break;
+      default:
+        continue;
+      }
+
+      /* XXX: Create and use S_MOV_IMM for SREGs */
+      BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::V_MOV_IMM))
+          .addOperand(MI.getOperand(0))
+          .addOperand(MI.getOperand(1));
+
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
-- 
1.7.7.6



