[llvm-commits] CVS: llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp

Fri Apr 13 13:31:13 PDT 2007

Changes in directory llvm/lib/Transforms/Scalar:

CodeGenPrepare.cpp updated: 1.3 -> 1.4
---
Log message:

Completely rewrite addressing-mode related sinking of code.  In particular,
this fixes problems where codegenprepare would sink expressions into load/stores
that are not valid, and fixes cases where it would miss important valid ones.

This fixes several serious codesize and perf issues, particularly on targets
with complex addressing modes like arm and x86.  For example, now we compile 
CodeGen/X86/isel-sink.ll to:

_test:
        movl 8(%esp), %eax
        movl 4(%esp), %ecx
        cmpl $1233, %eax
        ja LBB1_2       #F
LBB1_1: #T
        movl $4, (%ecx,%eax,4)
        movl $141, %eax
        ret
LBB1_2: #F
        movl (%ecx,%eax,4), %eax
        ret

instead of:

_test:
        movl 8(%esp), %eax
        leal (,%eax,4), %ecx
        addl 4(%esp), %ecx
        cmpl $1233, %eax
        ja LBB1_2       #F
LBB1_1: #T
        movl $4, (%ecx)
        movl $141, %eax
        ret
LBB1_2: #F
        movl (%ecx), %eax
        ret



---
Diffs of the changes:  (+543 -342)

 CodeGenPrepare.cpp |  885 ++++++++++++++++++++++++++++++++---------------------
 1 files changed, 543 insertions(+), 342 deletions(-)


Index: llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
diff -u llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp:1.3 llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp:1.4

--- llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp:1.3	Mon Apr  9 18:29:07 2007
+++ llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp	Fri Apr 13 15:30:56 2007
@@ -25,9 +25,12 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
 using namespace llvm;
 
 namespace {  
@@ -44,7 +47,9 @@
     bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
     void EliminateMostlyEmptyBlock(BasicBlock *BB);
     bool OptimizeBlock(BasicBlock &BB);
-    bool OptimizeGEPExpression(GetElementPtrInst *GEPI);
+    bool OptimizeLoadStoreInst(Instruction *I, Value *Addr,
+                               const Type *AccessTy,
+                               DenseMap<Value*,Value*> &SunkAddrs);
   };
 }
 static RegisterPass<CodeGenPrepare> X("codegenprepare",
@@ -291,312 +296,37 @@
   SplitCriticalEdge(TI, SuccNum, P, true);  
 }
 
-
-/// InsertGEPComputeCode - Insert code into BB to compute Ptr+PtrOffset,
-/// casting to the type of GEPI.
-static Instruction *InsertGEPComputeCode(Instruction *&V, BasicBlock *BB,
-                                         Instruction *GEPI, Value *Ptr,
-                                         Value *PtrOffset) {
-  if (V) return V;   // Already computed.
-  
-  // Figure out the insertion point
-  BasicBlock::iterator InsertPt;
-  if (BB == GEPI->getParent()) {
-    // If GEP is already inserted into BB, insert right after the GEP.
-    InsertPt = GEPI;
-    ++InsertPt;
-  } else {
-    // Otherwise, insert at the top of BB, after any PHI nodes
-    InsertPt = BB->begin();
-    while (isa<PHINode>(InsertPt)) ++InsertPt;
-  }
-  
-  // If Ptr is itself a cast, but in some other BB, emit a copy of the cast into
-  // BB so that there is only one value live across basic blocks (the cast 
-  // operand).
-  if (CastInst *CI = dyn_cast<CastInst>(Ptr))
-    if (CI->getParent() != BB && isa<PointerType>(CI->getOperand(0)->getType()))
-      Ptr = CastInst::create(CI->getOpcode(), CI->getOperand(0), CI->getType(),
-                             "", InsertPt);
-  
-  // Add the offset, cast it to the right type.
-  Ptr = BinaryOperator::createAdd(Ptr, PtrOffset, "", InsertPt);
-  // Ptr is an integer type, GEPI is pointer type ==> IntToPtr
-  return V = CastInst::create(Instruction::IntToPtr, Ptr, GEPI->getType(), 
-                              "", InsertPt);
-}
-
-/// ReplaceUsesOfGEPInst - Replace all uses of RepPtr with inserted code to
-/// compute its value.  The RepPtr value can be computed with Ptr+PtrOffset. One
-/// trivial way of doing this would be to evaluate Ptr+PtrOffset in RepPtr's
-/// block, then ReplaceAllUsesWith'ing everything.  However, we would prefer to
-/// sink PtrOffset into user blocks where doing so will likely allow us to fold
-/// the constant add into a load or store instruction.  Additionally, if a user
-/// is a pointer-pointer cast, we look through it to find its users.
-static void ReplaceUsesOfGEPInst(Instruction *RepPtr, Value *Ptr, 
-                                 Constant *PtrOffset, BasicBlock *DefBB,
-                                 GetElementPtrInst *GEPI,
-                           std::map<BasicBlock*,Instruction*> &InsertedExprs) {
-  while (!RepPtr->use_empty()) {
-    Instruction *User = cast<Instruction>(RepPtr->use_back());
-    
-    // If the user is a Pointer-Pointer cast, recurse. Only BitCast can be
-    // used for a Pointer-Pointer cast.
-    if (isa<BitCastInst>(User)) {
-      ReplaceUsesOfGEPInst(User, Ptr, PtrOffset, DefBB, GEPI, InsertedExprs);
-      
-      // Drop the use of RepPtr. The cast is dead.  Don't delete it now, else we
-      // could invalidate an iterator.
-      User->setOperand(0, UndefValue::get(RepPtr->getType()));
-      continue;
-    }
-    
-    // If this is a load of the pointer, or a store through the pointer, emit
-    // the increment into the load/store block.
-    Instruction *NewVal;
-    if (isa<LoadInst>(User) ||
-        (isa<StoreInst>(User) && User->getOperand(0) != RepPtr)) {
-      NewVal = InsertGEPComputeCode(InsertedExprs[User->getParent()], 
-                                    User->getParent(), GEPI,
-                                    Ptr, PtrOffset);
-    } else {
-      // If this use is not foldable into the addressing mode, use a version 
-      // emitted in the GEP block.
-      NewVal = InsertGEPComputeCode(InsertedExprs[DefBB], DefBB, GEPI, 
-                                    Ptr, PtrOffset);
-    }
-    
-    if (GEPI->getType() != RepPtr->getType()) {
-      BasicBlock::iterator IP = NewVal;
-      ++IP;
-      // NewVal must be a GEP which must be pointer type, so BitCast
-      NewVal = new BitCastInst(NewVal, RepPtr->getType(), "", IP);
-    }
-    User->replaceUsesOfWith(RepPtr, NewVal);
-  }
-}
-
-/// OptimizeGEPExpression - Since we are doing basic-block-at-a-time instruction
-/// selection, we want to be a bit careful about some things.  In particular, if
-/// we have a GEP instruction that is used in a different block than it is
-/// defined, the addressing expression of the GEP cannot be folded into loads or
-/// stores that use it.  In this case, decompose the GEP and move constant
-/// indices into blocks that use it.
-bool CodeGenPrepare::OptimizeGEPExpression(GetElementPtrInst *GEPI) {
-  // If this GEP is only used inside the block it is defined in, there is no
-  // need to rewrite it.
-  bool isUsedOutsideDefBB = false;
-  BasicBlock *DefBB = GEPI->getParent();
-  for (Value::use_iterator UI = GEPI->use_begin(), E = GEPI->use_end(); 
-       UI != E; ++UI) {
-    if (cast<Instruction>(*UI)->getParent() != DefBB) {
-      isUsedOutsideDefBB = true;
-      break;
-    }
-  }
-  if (!isUsedOutsideDefBB) return false;
-
-  // If this GEP has no non-zero constant indices, there is nothing we can do,
-  // ignore it.
-  bool hasConstantIndex = false;
-  bool hasVariableIndex = false;
-  for (GetElementPtrInst::op_iterator OI = GEPI->op_begin()+1,
-       E = GEPI->op_end(); OI != E; ++OI) {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(*OI)) {
-      if (!CI->isZero()) {
-        hasConstantIndex = true;
-        break;
-      }
-    } else {
-      hasVariableIndex = true;
-    }
-  }
-  
-  // If this is a "GEP X, 0, 0, 0", turn this into a cast.
-  if (!hasConstantIndex && !hasVariableIndex) {
-    /// The GEP operand must be a pointer, so must its result -> BitCast
-    Value *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), 
-                                GEPI->getName(), GEPI);
-    GEPI->replaceAllUsesWith(NC);
-    GEPI->eraseFromParent();
-    return true;
-  }
+/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
+/// copy (e.g. it's casting from one pointer type to another, int->uint, or
+/// int->sbyte on PPC), sink it into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+  // Work out whether this cast is a noop copy on this target.
+  MVT::ValueType SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
+  MVT::ValueType DstVT = TLI.getValueType(CI->getType());
   
-  // If this is a GEP &Alloca, 0, 0, forward subst the frame index into uses.
-  if (!hasConstantIndex && !isa<AllocaInst>(GEPI->getOperand(0)))
+  // This is an fp<->int conversion?
+  if (MVT::isInteger(SrcVT) != MVT::isInteger(DstVT))
     return false;
-
-  // If we don't have target lowering info, we can't lower the GEP.
-  if (!TLI) return false;
-  const TargetData *TD = TLI->getTargetData();
-
-  // Otherwise, decompose the GEP instruction into multiplies and adds.  Sum the
-  // constant offset (which we now know is non-zero) and deal with it later.
-  uint64_t ConstantOffset = 0;
-  const Type *UIntPtrTy = TD->getIntPtrType();
-  Value *Ptr = new PtrToIntInst(GEPI->getOperand(0), UIntPtrTy, "", GEPI);
-  const Type *Ty = GEPI->getOperand(0)->getType();
-
-  for (GetElementPtrInst::op_iterator OI = GEPI->op_begin()+1,
-       E = GEPI->op_end(); OI != E; ++OI) {
-    Value *Idx = *OI;
-    if (const StructType *StTy = dyn_cast<StructType>(Ty)) {
-      unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
-      if (Field)
-        ConstantOffset += TD->getStructLayout(StTy)->getElementOffset(Field);
-      Ty = StTy->getElementType(Field);
-    } else {
-      Ty = cast<SequentialType>(Ty)->getElementType();
-
-      // Handle constant subscripts.
-      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
-        if (CI->getZExtValue() == 0) continue;
-        ConstantOffset += (int64_t)TD->getTypeSize(Ty)*CI->getSExtValue();
-        continue;
-      }
-      
-      // Ptr = Ptr + Idx * ElementSize;
-      
-      // Cast Idx to UIntPtrTy if needed.
-      Idx = CastInst::createIntegerCast(Idx, UIntPtrTy, true/*SExt*/, "", GEPI);
-      
-      uint64_t ElementSize = TD->getTypeSize(Ty);
-      // Mask off bits that should not be set.
-      ElementSize &= ~0ULL >> (64-UIntPtrTy->getPrimitiveSizeInBits());
-      Constant *SizeCst = ConstantInt::get(UIntPtrTy, ElementSize);
-
-      // Multiply by the element size and add to the base.
-      Idx = BinaryOperator::createMul(Idx, SizeCst, "", GEPI);
-      Ptr = BinaryOperator::createAdd(Ptr, Idx, "", GEPI);
-    }
-  }
   
-  // Make sure that the offset fits in uintptr_t.
-  ConstantOffset &= ~0ULL >> (64-UIntPtrTy->getPrimitiveSizeInBits());
-  Constant *PtrOffset = ConstantInt::get(UIntPtrTy, ConstantOffset);
-  
-  // Okay, we have now emitted all of the variable index parts to the BB that
-  // the GEP is defined in.  Loop over all of the using instructions, inserting
-  // an "add Ptr, ConstantOffset" into each block that uses it and update the
-  // instruction to use the newly computed value, making GEPI dead.  When the
-  // user is a load or store instruction address, we emit the add into the user
-  // block, otherwise we use a canonical version right next to the gep (these 
-  // won't be foldable as addresses, so we might as well share the computation).
+  // If this is an extension, it will be a zero or sign extension, which
+  // isn't a noop.
+  if (SrcVT < DstVT) return false;
+  
+  // If these values will be promoted, find out what they will be promoted
+  // to.  This helps us consider truncates on PPC as noop copies when they
+  // are.
+  if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote)
+    SrcVT = TLI.getTypeToTransformTo(SrcVT);
+  if (TLI.getTypeAction(DstVT) == TargetLowering::Promote)
+    DstVT = TLI.getTypeToTransformTo(DstVT);
   
-  std::map<BasicBlock*,Instruction*> InsertedExprs;
-  ReplaceUsesOfGEPInst(GEPI, Ptr, PtrOffset, DefBB, GEPI, InsertedExprs);
-  
-  // Finally, the GEP is dead, remove it.
-  GEPI->eraseFromParent();
-  
-  return true;
-}
-
-/// SinkInvariantGEPIndex - If a GEP instruction has a variable index that has
-/// been hoisted out of the loop by LICM pass, sink it back into the use BB
-/// if it can be determined that the index computation can be folded into the
-/// addressing mode of the load / store uses.
-static bool SinkInvariantGEPIndex(BinaryOperator *BinOp,
-                                  const TargetLowering &TLI) {
-  // Only look at Add.
-  if (BinOp->getOpcode() != Instruction::Add)
-    return false;
-
-  // DestBBs - These are the blocks where a copy of BinOp will be inserted.
-  SmallSet<BasicBlock*, 8> DestBBs;
-  BasicBlock *DefBB = BinOp->getParent();
-  bool MadeChange = false;
-  for (Value::use_iterator UI = BinOp->use_begin(), E = BinOp->use_end(); 
-       UI != E; ++UI) {
-    Instruction *GEPI = cast<Instruction>(*UI);
-    // Only look for GEP use in another block.
-    if (GEPI->getParent() == DefBB) continue;
-
-    if (isa<GetElementPtrInst>(GEPI)) {
-      // If the GEP has another variable index, abondon.
-      bool hasVariableIndex = false;
-      for (GetElementPtrInst::op_iterator OI = GEPI->op_begin()+1,
-             OE = GEPI->op_end(); OI != OE; ++OI)
-        if (*OI != BinOp && !isa<ConstantInt>(*OI)) {
-          hasVariableIndex = true;
-          break;
-        }
-      if (hasVariableIndex)
-        break;
-
-      BasicBlock *GEPIBB = GEPI->getParent();
-      for (Value::use_iterator UUI = GEPI->use_begin(), UE = GEPI->use_end(); 
-           UUI != UE; ++UUI) {
-        Instruction *GEPIUser = cast<Instruction>(*UUI);
-        const Type *UseTy = NULL;
-        if (LoadInst *Load = dyn_cast<LoadInst>(GEPIUser))
-          UseTy = Load->getType();
-        else if (StoreInst *Store = dyn_cast<StoreInst>(GEPIUser))
-          UseTy = Store->getOperand(0)->getType();
-
-        // Check if it is possible to fold the expression to address mode.
-        if (UseTy && isa<ConstantInt>(BinOp->getOperand(1))) {
-          int64_t Cst = cast<ConstantInt>(BinOp->getOperand(1))->getSExtValue();
-          // e.g. load (gep i32 * %P, (X+42)) => load (%P + X*4 + 168).
-          TargetLowering::AddrMode AM;
-          // FIXME: This computation isn't right, scale is incorrect.
-          AM.Scale = TLI.getTargetData()->getTypeSize(UseTy);
-          // FIXME: Should should also include other fixed offsets.
-          AM.BaseOffs = Cst*AM.Scale;
-          
-          if (TLI.isLegalAddressingMode(AM, UseTy)) {
-            DestBBs.insert(GEPIBB);
-            MadeChange = true;
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  // Nothing to do.
-  if (!MadeChange)
+  // If, after promotion, these are the same types, this is a noop copy.
+  if (SrcVT != DstVT)
     return false;
-
-  /// InsertedOps - Only insert a duplicate in each block once.
-  std::map<BasicBlock*, BinaryOperator*> InsertedOps;
-  for (Value::use_iterator UI = BinOp->use_begin(), E = BinOp->use_end(); 
-       UI != E; ) {
-    Instruction *User = cast<Instruction>(*UI);
-    BasicBlock *UserBB = User->getParent();
-
-    // Preincrement use iterator so we don't invalidate it.
-    ++UI;
-
-    // If any user in this BB wants it, replace all the uses in the BB.
-    if (DestBBs.count(UserBB)) {
-      // Sink it into user block.
-      BinaryOperator *&InsertedOp = InsertedOps[UserBB];
-      if (!InsertedOp) {
-        BasicBlock::iterator InsertPt = UserBB->begin();
-        while (isa<PHINode>(InsertPt)) ++InsertPt;
-      
-        InsertedOp =
-          BinaryOperator::create(BinOp->getOpcode(), BinOp->getOperand(0),
-                                 BinOp->getOperand(1), "", InsertPt);
-      }
-
-      User->replaceUsesOfWith(BinOp, InsertedOp);
-    }
-  }
-
-  if (BinOp->use_empty())
-      BinOp->eraseFromParent();
-
-  return true;
-}
-
-/// OptimizeNoopCopyExpression - We have determined that the specified cast
-/// instruction is a noop copy (e.g. it's casting from one pointer type to
-/// another, int->uint, or int->sbyte on PPC.
-///
-/// Return true if any changes are made.
-static bool OptimizeNoopCopyExpression(CastInst *CI) {
+  
   BasicBlock *DefBB = CI->getParent();
   
   /// InsertedCasts - Only insert a cast in each block once.
@@ -646,7 +376,468 @@
   return MadeChange;
 }
 
+/// EraseDeadInstructions - Delete V if it is a trivially dead instruction,
+/// then keep deleting operand instructions that become dead as a result.
+static void EraseDeadInstructions(Value *V) {
+  Instruction *Root = dyn_cast<Instruction>(V);
+  if (Root == 0) return;
+  if (!Root->use_empty()) return;
+  // Worklist of instructions whose deadness still has to be checked.
+  SmallPtrSet<Instruction*, 16> Worklist;
+  Worklist.insert(Root);
+  do {
+    Instruction *Cur = *Worklist.begin();
+    Worklist.erase(Cur);
+    if (!isInstructionTriviallyDead(Cur)) continue;
+    for (unsigned Op = 0, NumOps = Cur->getNumOperands(); Op != NumOps; ++Op)
+      if (Instruction *Def = dyn_cast<Instruction>(Cur->getOperand(Op)))
+        Worklist.insert(Def);
+    Cur->eraseFromParent();
+  } while (!Worklist.empty());
+}
+
 
+/// ExtAddrMode - TargetLowering::AddrMode extended with the actual Value*'s
+/// that supply the base and scaled registers.
+struct ExtAddrMode : public TargetLowering::AddrMode {
+  Value *BaseReg;    // Value used as the base register, or null if none.
+  Value *ScaledReg;  // Value that gets multiplied by Scale, or null if none.
+  ExtAddrMode() { BaseReg = 0; ScaledReg = 0; }
+  void dump() const;
+};
+
+/// operator<< - Print AM as "[a + b + ...]", listing only the components
+/// (global, offset, base register, scaled register) that are present.
+static std::ostream &operator<<(std::ostream &OS, const ExtAddrMode &AM) {
+  const char *Sep = "";   // Becomes " + " after the first component.
+  OS << "[";
+  if (AM.BaseGV) {
+    OS << Sep << "GV:%" << AM.BaseGV->getName();
+    Sep = " + ";
+  }
+  if (AM.BaseOffs) { OS << Sep << AM.BaseOffs; Sep = " + "; }
+  if (AM.BaseReg) {
+    OS << Sep << "Base:%" << AM.BaseReg->getName();
+    Sep = " + ";
+  }
+  if (AM.Scale)
+    OS << Sep << AM.Scale << "*%" << AM.ScaledReg->getName();
+
+  return OS << "]";
+}
+
+void ExtAddrMode::dump() const {  // Debug aid: print this mode to cerr.
+  cerr << *this << "\n";
+}
+
+static bool TryMatchingScaledValue(Value *ScaleReg, int64_t Scale,
+                                   const Type *AccessTy, ExtAddrMode &AddrMode,
+                                   SmallVector<Instruction*, 16> &AddrModeInsts,
+                                   const TargetLowering &TLI, unsigned Depth);
+  
+/// FindMaximalLegalAddressingMode - Try to fold the computation of Addr (a
+/// pointer or target intptr_t value) into AddrMode, recording each folded
+/// instruction in AddrModeInsts.  Returns false if Addr can't be added to
+/// AddrMode.  Depth bounds the recursion: look-through stops at depth 5.
+static bool FindMaximalLegalAddressingMode(Value *Addr, const Type *AccessTy,
+                                           ExtAddrMode &AddrMode,
+                                   SmallVector<Instruction*, 16> &AddrModeInsts,
+                                           const TargetLowering &TLI,
+                                           unsigned Depth) {
+  
+  // If this is a global variable, fold it into the addressing mode if possible.
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
+    if (AddrMode.BaseGV == 0) {
+      AddrMode.BaseGV = GV;
+      if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+        return true;
+      AddrMode.BaseGV = 0;
+    }
+  } else if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
+    AddrMode.BaseOffs += CI->getSExtValue();
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.BaseOffs -= CI->getSExtValue();
+  } else if (isa<ConstantPointerNull>(Addr)) {
+    return true;
+  }
+  
+  // Look through constant exprs and instructions.
+  unsigned Opcode = ~0U;
+  User *AddrInst = 0;
+  if (Instruction *I = dyn_cast<Instruction>(Addr)) {
+    Opcode = I->getOpcode();
+    AddrInst = I;
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+    Opcode = CE->getOpcode();
+    AddrInst = CE;
+  }
+
+  // Limit recursion to avoid exponential behavior.
+  if (Depth == 5) { AddrInst = 0; Opcode = ~0U; }
+
+  // If this is really an instruction, add it to our list of related
+  // instructions.
+  if (Instruction *I = dyn_cast_or_null<Instruction>(AddrInst))
+    AddrModeInsts.push_back(I);
+
+  switch (Opcode) {
+  case Instruction::PtrToInt:
+    // PtrToInt is always a noop, as we know that the int type is pointer sized.
+    if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth))
+      return true;
+    break;
+  case Instruction::IntToPtr:
+    // This inttoptr is a no-op if the integer type is pointer sized.
+    if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
+        TLI.getPointerTy()) {
+      if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                         AddrMode, AddrModeInsts, TLI, Depth))
+        return true;
+    }
+    break;
+  case Instruction::Add: {
+    // Check to see if we can merge in the RHS then the LHS.  If so, we win.
+    ExtAddrMode BackupAddrMode = AddrMode;
+    unsigned OldSize = AddrModeInsts.size();
+    if (FindMaximalLegalAddressingMode(AddrInst->getOperand(1), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth+1) &&
+        FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth+1))
+      return true;
+
+    // Restore the old addr mode info.
+    AddrMode = BackupAddrMode;
+    AddrModeInsts.resize(OldSize);
+    
+    // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
+    if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth+1) &&
+        FindMaximalLegalAddressingMode(AddrInst->getOperand(1), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth+1))
+      return true;
+    
+    // Otherwise we definitely can't merge the ADD in.
+    AddrMode = BackupAddrMode;
+    AddrModeInsts.resize(OldSize);
+    break;    
+  }
+  case Instruction::Or: {
+    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+    if (!RHS) break;
+    // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
+    break;
+  }
+  case Instruction::Mul:
+  case Instruction::Shl: {
+    // Can only handle X*C and X << C, and can only handle this when the scale
+    // field is available.
+    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+    if (!RHS) break;
+    int64_t Scale = RHS->getSExtValue();
+    if (Opcode == Instruction::Shl)
+      Scale = 1LL << Scale;  // Shift in 64 bits; "1 << Scale" overflows int.
+    
+    if (TryMatchingScaledValue(AddrInst->getOperand(0), Scale, AccessTy,
+                               AddrMode, AddrModeInsts, TLI, Depth))
+      return true;
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    // Scan the GEP.  We can only fold it if it contains constant offsets and
+    // at most one variable offset.
+    int VariableOperand = -1;
+    unsigned VariableScale = 0;
+    
+    int64_t ConstantOffset = 0;
+    const TargetData *TD = TLI.getTargetData();
+    gep_type_iterator GTI = gep_type_begin(AddrInst);
+    for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
+      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = TD->getStructLayout(STy);
+        unsigned Idx =
+          cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
+        ConstantOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t TypeSize = TD->getTypeSize(GTI.getIndexedType());
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+          ConstantOffset += CI->getSExtValue()*TypeSize;
+        } else if (TypeSize) {  // Scales of zero don't do anything.
+          // We only allow one variable index at the moment.
+          if (VariableOperand != -1) {
+            VariableOperand = -2;
+            break;
+          }
+          
+          // Remember the variable index.
+          VariableOperand = i;
+          VariableScale = TypeSize;
+        }
+      }
+    }
+
+    // If the GEP had multiple variable indices, punt.
+    if (VariableOperand == -2)
+      break;
+
+    // A common case is for the GEP to only do a constant offset.  In this case,
+    // just add it to the disp field and check validity.
+    if (VariableOperand == -1) {
+      AddrMode.BaseOffs += ConstantOffset;
+      if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
+        // Check to see if we can fold the base pointer in too.
+        if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                           AddrMode, AddrModeInsts, TLI,
+                                           Depth+1))
+          return true;
+      }
+      AddrMode.BaseOffs -= ConstantOffset;
+    } else {
+      // Check that this has no base reg yet.  If so, we won't have a place to
+      // put the base of the GEP (assuming it is not a null ptr).
+      bool SetBaseReg = false;
+      if (AddrMode.HasBaseReg) {
+        if (!isa<ConstantPointerNull>(AddrInst->getOperand(0)))
+          break;
+      } else {
+        AddrMode.HasBaseReg = true;
+        AddrMode.BaseReg = AddrInst->getOperand(0);
+        SetBaseReg = true;
+      }
+      
+      // See if the scale amount is valid for this target.
+      AddrMode.BaseOffs += ConstantOffset;
+      if (TryMatchingScaledValue(AddrInst->getOperand(VariableOperand),
+                                 VariableScale, AccessTy, AddrMode, 
+                                 AddrModeInsts, TLI, Depth)) {
+        if (!SetBaseReg) return true;
+
+        // If this match succeeded, we know that we can form an address with the
+        // GepBase as the basereg.  See if we can match *more*.
+        AddrMode.HasBaseReg = false;
+        AddrMode.BaseReg = 0;
+        if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                           AddrMode, AddrModeInsts, TLI,
+                                           Depth+1))
+          return true;
+        // Strange, shouldn't happen.  Restore the base reg and succeed the easy
+        // way.        
+        AddrMode.HasBaseReg = true;
+        AddrMode.BaseReg = AddrInst->getOperand(0);
+        return true;
+      }
+      
+      AddrMode.BaseOffs -= ConstantOffset;
+      if (SetBaseReg) {
+        AddrMode.HasBaseReg = false;
+        AddrMode.BaseReg = 0;
+      }
+    }
+    break;    
+  }
+  }
+  
+  if (Instruction *I = dyn_cast_or_null<Instruction>(AddrInst)) {
+    assert(AddrModeInsts.back() == I && "Stack imbalance");
+    AddrModeInsts.pop_back();
+  }
+  
+  // Worst case, the target should support [reg] addressing modes. :)
+  if (!AddrMode.HasBaseReg) {
+    AddrMode.HasBaseReg = true;
+    // Still check for legality in case the target supports [imm] but not [i+r].
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) {
+      AddrMode.BaseReg = Addr;
+      return true;
+    }
+    AddrMode.HasBaseReg = false;
+  }
+  
+  // If the base register is already taken, see if we can do [r+r].
+  if (AddrMode.Scale == 0) {
+    AddrMode.Scale = 1;
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) {
+      AddrMode.ScaledReg = Addr;
+      return true;
+    }
+    AddrMode.Scale = 0;
+  }
+  // Couldn't match.
+  return false;
+}
+
+/// TryMatchingScaledValue - Attempt to fold ScaleReg*Scale into AddrMode.
+/// Returns true (AddrMode updated) if the target accepts the resulting mode,
+/// and false (with the scale change backed out) otherwise.
+static bool TryMatchingScaledValue(Value *ScaleReg, int64_t Scale,
+                                   const Type *AccessTy, ExtAddrMode &AddrMode,
+                                   SmallVector<Instruction*, 16> &AddrModeInsts,
+                                   const TargetLowering &TLI, unsigned Depth) {
+  // The scale field is usable only if it is free or already scales ScaleReg.
+  const bool ScaleFieldBusy = AddrMode.Scale != 0;
+  if (ScaleFieldBusy && AddrMode.ScaledReg != ScaleReg)
+    return false;
+
+  // Snapshot the incoming mode; the (X+C)*Scale rewrite below starts from it.
+  ExtAddrMode OrigMode = AddrMode;
+
+  // Accumulate the scale, e.g. X*4+X*3 -> X*7 or [A+B + A*7] -> [B+A*8].
+  AddrMode.ScaledReg = ScaleReg;
+  AddrMode.Scale += Scale;
+
+  if (!TLI.isLegalAddressingMode(AddrMode, AccessTy)) {
+    // The target rejected it: undo the accumulation.
+    AddrMode.Scale -= Scale;
+    if (AddrMode.Scale == 0) AddrMode.ScaledReg = 0;
+    return false;
+  }
+
+  // ScaleReg*Scale fits.  If ScaleReg is itself "X + C" and no scaled register
+  // was in use on entry, try the form X*Scale with C*Scale moved into the
+  // displacement.
+  BinaryOperator *AddOp = dyn_cast<BinaryOperator>(ScaleReg);
+  if (AddOp == 0 || AddOp->getOpcode() != Instruction::Add)
+    return true;
+  ConstantInt *AddCst = dyn_cast<ConstantInt>(AddOp->getOperand(1));
+  if (AddCst == 0 || OrigMode.ScaledReg != 0)
+    return true;
+
+  OrigMode.Scale = Scale;
+  OrigMode.ScaledReg = AddOp->getOperand(0);
+  OrigMode.BaseOffs += AddCst->getSExtValue()*Scale;
+  if (TLI.isLegalAddressingMode(OrigMode, AccessTy)) {
+    // Also record the add as part of the address computation.
+    AddrModeInsts.push_back(AddOp);
+    AddrMode = OrigMode;
+  }
+
+  // Whether or not the (X+C) form was taken, ScaleReg*Scale was matched.
+  return true;
+}
+
+
+/// IsNonLocalValue - Return true if V is an instruction that lives in a basic
+/// block other than BB.  Values that are not instructions are never non-local.
+static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (Inst == 0) return false;
+  return Inst->getParent() != BB;
+}
+
+/// OptimizeLoadStoreInst - Load and store instructions often have
+/// addressing modes that can do significant amounts of computation.  As such,
+/// instruction selection will try to get the load or store to do as much
+/// computation as possible for the program.  The problem is that isel can only
+/// see within a single block.  As such, we sink as much legal addressing mode
+/// stuff into the block as possible.  Returns true if LdStInst was changed.
+bool CodeGenPrepare::OptimizeLoadStoreInst(Instruction *LdStInst, Value *Addr,
+                                           const Type *AccessTy,
+                                           DenseMap<Value*,Value*> &SunkAddrs) {
+  // Figure out what addressing mode will be built up for this operation.
+  SmallVector<Instruction*, 16> AddrModeInsts;
+  ExtAddrMode AddrMode;
+  bool Success = FindMaximalLegalAddressingMode(Addr, AccessTy, AddrMode,
+                                                AddrModeInsts, *TLI, 0);
+  Success = Success; assert(Success && "Couldn't select *anything*?");
+  
+  // Check to see if any of the instructions subsumed by this addr mode are
+  // non-local to LdStInst's BB.
+  bool AnyNonLocal = false;
+  for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
+    if (IsNonLocalValue(AddrModeInsts[i], LdStInst->getParent())) {
+      AnyNonLocal = true;
+      break;
+    }
+  }
+  
+  // If all the instructions matched are already in this BB, don't do anything.
+  if (!AnyNonLocal) {
+    DEBUG(cerr << "CGP: Found      local addrmode: " << AddrMode << "\n");
+    return false;
+  }
+  
+  // Insert this computation right before this user.  Since our caller is
+  // scanning from the top of the BB to the bottom, reuses of the expr are
+  // guaranteed to happen later.
+  BasicBlock::iterator InsertPt = LdStInst;
+  
+  // Now that we determined the addressing expression we want to use and know
+  // that we have to sink it into this block.  Check to see if we have already
+  // done this for some other load/store instr in this block.  If so, reuse the
+  // computation.
+  Value *&SunkAddr = SunkAddrs[Addr];
+  if (SunkAddr) {
+    DEBUG(cerr << "CGP: Reusing nonlocal addrmode: " << AddrMode << "\n");
+    if (SunkAddr->getType() != Addr->getType())
+      SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt);
+  } else {
+    DEBUG(cerr << "CGP: SINKING nonlocal addrmode: " << AddrMode << "\n");
+    const Type *IntPtrTy = TLI->getTargetData()->getIntPtrType();
+    
+    Value *Result = 0;
+    // Start with the scale value, converted to IntPtrTy if necessary.
+    if (AddrMode.Scale) {
+      Value *V = AddrMode.ScaledReg;
+      if (V->getType() == IntPtrTy) {
+        // done.
+      } else if (isa<PointerType>(V->getType())) {
+        V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+                 cast<IntegerType>(V->getType())->getBitWidth()) {
+        V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      } else {
+        V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      }
+      if (AddrMode.Scale != 1)
+        V = BinaryOperator::createMul(V, ConstantInt::get(IntPtrTy,
+                                                          AddrMode.Scale),
+                                      "sunkaddr", InsertPt);
+      Result = V;
+    }
+
+    // Add in the base register.
+    if (AddrMode.BaseReg) {
+      Value *V = AddrMode.BaseReg;
+      if (V->getType() != IntPtrTy)
+        V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      if (Result)
+        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+    
+    // Add in the BaseGV if present.
+    if (AddrMode.BaseGV) {
+      Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr",
+                                  InsertPt);
+      if (Result)
+        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+    
+    // Add in the Base Offset if present.
+    if (AddrMode.BaseOffs) {
+      Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+      if (Result)
+        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+
+    if (Result == 0)
+      SunkAddr = Constant::getNullValue(Addr->getType());
+    else
+      SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt);
+  }
+  
+  LdStInst->replaceUsesOfWith(Addr, SunkAddr);
+  
+  if (Addr->use_empty())
+    EraseDeadInstructions(Addr);
+  return true;
+}
 
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
@@ -665,21 +856,15 @@
   }
   
   
+  // Keep track of non-local addresses that have been sunk into this block.
+  // This allows us to avoid inserting duplicate code for blocks with multiple
+  // load/stores of the same address.
+  DenseMap<Value*, Value*> SunkAddrs;
+  
   for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ) {
     Instruction *I = BBI++;
     
-    if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      // If we found an inline asm expession, and if the target knows how to
-      // lower it to normal LLVM code, do so now.
-      if (TLI && isa<InlineAsm>(CI->getCalledValue()))
-        if (const TargetAsmInfo *TAI = 
-            TLI->getTargetMachine().getTargetAsmInfo()) {
-          if (TAI->ExpandInlineAsm(CI))
-            BBI = BB.begin();
-        }
-    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
-      MadeChange |= OptimizeGEPExpression(GEPI);
-    } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    if (CastInst *CI = dyn_cast<CastInst>(I)) {
       // If the source of the cast is a constant, then this should have
       // already been constant folded.  The only reason NOT to constant fold
       // it is if something (e.g. LSR) was careful to place the constant
@@ -689,37 +874,53 @@
       if (isa<Constant>(CI->getOperand(0)))
         continue;
       
-      if (!TLI) continue;
-      
-      // If this is a noop copy, sink it into user blocks to reduce the number
-      // of virtual registers that must be created and coallesced.
-      MVT::ValueType SrcVT = TLI->getValueType(CI->getOperand(0)->getType());
-      MVT::ValueType DstVT = TLI->getValueType(CI->getType());
-      
-      // This is an fp<->int conversion?
-      if (MVT::isInteger(SrcVT) != MVT::isInteger(DstVT))
-        continue;
-      
-      // If this is an extension, it will be a zero or sign extension, which
-      // isn't a noop.
-      if (SrcVT < DstVT) continue;
-      
-      // If these values will be promoted, find out what they will be promoted
-      // to.  This helps us consider truncates on PPC as noop copies when they
-      // are.
-      if (TLI->getTypeAction(SrcVT) == TargetLowering::Promote)
-        SrcVT = TLI->getTypeToTransformTo(SrcVT);
-      if (TLI->getTypeAction(DstVT) == TargetLowering::Promote)
-        DstVT = TLI->getTypeToTransformTo(DstVT);
-      
-      // If, after promotion, these are the same types, this is a noop copy.
-      if (SrcVT == DstVT)
-        MadeChange |= OptimizeNoopCopyExpression(CI);
-    } else if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I)) {
       if (TLI)
-        MadeChange |= SinkInvariantGEPIndex(BinOp, *TLI);
+        MadeChange |= OptimizeNoopCopyExpression(CI, *TLI);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      if (TLI)
+        MadeChange |= OptimizeLoadStoreInst(I, I->getOperand(0), LI->getType(),
+                                            SunkAddrs);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      if (TLI)
+        MadeChange |= OptimizeLoadStoreInst(I, SI->getOperand(1),
+                                            SI->getOperand(0)->getType(),
+                                            SunkAddrs);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+      bool HasNonZeroIdx = false;
+      for (GetElementPtrInst::op_iterator OI = GEPI->op_begin()+1,
+           E = GEPI->op_end(); OI != E; ++OI) {
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(*OI)) {
+          if (!CI->isZero()) {
+            HasNonZeroIdx = true;
+            break;
+          }
+        } else {
+          HasNonZeroIdx = true;
+          break;
+        }
+      }
+      
+      if (!HasNonZeroIdx) {
+        /// The GEP operand must be a pointer, so must its result -> BitCast
+        Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), 
+                                          GEPI->getName(), GEPI);
+        GEPI->replaceAllUsesWith(NC);
+        GEPI->eraseFromParent();
+        MadeChange = true;
+        BBI = NC;
+      }
+    } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      // If we found an inline asm expression, and if the target knows how to
+      // lower it to normal LLVM code, do so now.
+      if (TLI && isa<InlineAsm>(CI->getCalledValue()))
+        if (const TargetAsmInfo *TAI = 
+            TLI->getTargetMachine().getTargetAsmInfo()) {
+          if (TAI->ExpandInlineAsm(CI))
+            BBI = BB.begin();
+        }
     }
   }
+    
   return MadeChange;
 }