Sun Jul 6 13:45:51 PDT 2008

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86ISelLowering.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================

--- llvm/branches/non-call-eh/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86ISelLowering.cpp Sun Jul  6 15:45:41 2008
@@ -25,7 +25,6 @@
 #include "llvm/Intrinsics.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/VectorExtras.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -41,7 +40,10 @@
 #include "llvm/ADT/StringExtras.h"
 using namespace llvm;
 
-X86TargetLowering::X86TargetLowering(TargetMachine &TM)
+// Forward declarations.
+static SDOperand getMOVLMask(unsigned NumElems, SelectionDAG &DAG);
+
+X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   : TargetLowering(TM) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
   X86ScalarSSEf64 = Subtarget->hasSSE2();
@@ -264,6 +266,8 @@
   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
@@ -287,18 +291,22 @@
   if (!Subtarget->hasSSE2())
     setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
 
-  setOperationAction(ISD::ATOMIC_LCS     , MVT::i8, Custom);
-  setOperationAction(ISD::ATOMIC_LCS     , MVT::i16, Custom);
-  setOperationAction(ISD::ATOMIC_LCS     , MVT::i32, Custom);
-  setOperationAction(ISD::ATOMIC_LCS     , MVT::i64, Custom);
+  // Expand certain atomics
+  setOperationAction(ISD::ATOMIC_CMP_SWAP , MVT::i8, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP , MVT::i16, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP , MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP , MVT::i64, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB , MVT::i32, Expand);
 
-  // Use the default ISD::LOCATION, ISD::DECLARE expansion.
-  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
+  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
   // FIXME - use subtarget debug flags
   if (!Subtarget->isTargetDarwin() &&
       !Subtarget->isTargetELF() &&
-      !Subtarget->isTargetCygMing())
-    setOperationAction(ISD::LABEL, MVT::Other, Expand);
+      !Subtarget->isTargetCygMing()) {
+    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  }
 
   setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
   setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
@@ -320,12 +328,14 @@
 
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
-  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
-  if (Subtarget->is64Bit())
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
-  else
+  } else {
+    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
+  }
 
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
@@ -484,49 +494,51 @@
   // will selectively turn on ones that can be effectively codegen'd.
   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
        VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-    setOperationAction(ISD::ADD , (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::SUB , (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FADD, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FNEG, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FSUB, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::MUL , (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FMUL, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::LOAD, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FABS, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FSIN, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FCOS, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FREM, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FPOWI, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FSQRT, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FCOPYSIGN, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::SMUL_LOHI, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::UMUL_LOHI, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::SDIVREM, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::UDIVREM, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::FPOW, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::CTPOP, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::CTTZ, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::CTLZ, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::SHL, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::SRA, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::SRL, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::ROTL, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::ROTR, (MVT::ValueType)VT, Expand);
-    setOperationAction(ISD::BSWAP, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
   }
 
   if (Subtarget->hasMMX()) {
     addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
     addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
     addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
     addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
 
     // FIXME: add MMX packed arithmetics
@@ -574,11 +586,14 @@
     AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
     setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
     AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
     setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
 
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
 
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
@@ -605,6 +620,7 @@
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v4f32, Legal);
   }
 
   if (Subtarget->hasSSE2()) {
@@ -630,6 +646,12 @@
     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 
+    setOperationAction(ISD::VSETCC,             MVT::v2f64, Legal);
+    setOperationAction(ISD::VSETCC,             MVT::v16i8, Legal);
+    setOperationAction(ISD::VSETCC,             MVT::v8i16, Legal);
+    setOperationAction(ISD::VSETCC,             MVT::v4i32, Legal);
+    setOperationAction(ISD::VSETCC,             MVT::v2i64, Legal);
+
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
@@ -637,13 +659,14 @@
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 
     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
-    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
       // Do not attempt to custom lower non-power-of-2 vectors
-      if (!isPowerOf2_32(MVT::getVectorNumElements(VT)))
+      if (!isPowerOf2_32(VT.getVectorNumElements()))
         continue;
-      setOperationAction(ISD::BUILD_VECTOR,        (MVT::ValueType)VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,      (MVT::ValueType)VT, Custom);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
@@ -658,16 +681,16 @@
 
     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
     for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
-      setOperationAction(ISD::AND,    (MVT::ValueType)VT, Promote);
-      AddPromotedToType (ISD::AND,    (MVT::ValueType)VT, MVT::v2i64);
-      setOperationAction(ISD::OR,     (MVT::ValueType)VT, Promote);
-      AddPromotedToType (ISD::OR,     (MVT::ValueType)VT, MVT::v2i64);
-      setOperationAction(ISD::XOR,    (MVT::ValueType)VT, Promote);
-      AddPromotedToType (ISD::XOR,    (MVT::ValueType)VT, MVT::v2i64);
-      setOperationAction(ISD::LOAD,   (MVT::ValueType)VT, Promote);
-      AddPromotedToType (ISD::LOAD,   (MVT::ValueType)VT, MVT::v2i64);
-      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
-      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::AND,    (MVT::SimpleValueType)VT, Promote);
+      AddPromotedToType (ISD::AND,    (MVT::SimpleValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     (MVT::SimpleValueType)VT, Promote);
+      AddPromotedToType (ISD::OR,     (MVT::SimpleValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    (MVT::SimpleValueType)VT, Promote);
+      AddPromotedToType (ISD::XOR,    (MVT::SimpleValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   (MVT::SimpleValueType)VT, Promote);
+      AddPromotedToType (ISD::LOAD,   (MVT::SimpleValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
+      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64);
     }
 
     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -677,11 +700,13 @@
     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
+    
   }
   
   if (Subtarget->hasSSE41()) {
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
+    setOperationAction(ISD::MUL,                MVT::v2i64, Legal);
 
     // i8 and i16 vectors are custom , because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
@@ -708,6 +733,7 @@
 
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::STORE);
 
@@ -715,16 +741,15 @@
 
   // FIXME: These should be based on subtarget info. Plus, the values should
   // be smaller when we are in optimizing for size mode.
-  maxStoresPerMemset = 16; // For %llvm.memset -> sequence of stores
-  maxStoresPerMemcpy = 16; // For %llvm.memcpy -> sequence of stores
-  maxStoresPerMemmove = 16; // For %llvm.memmove -> sequence of stores
+  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
   allowUnalignedMemoryAccesses = true; // x86 supports it!
   setPrefLoopAlignment(16);
 }
 
 
-MVT::ValueType
-X86TargetLowering::getSetCCResultType(const SDOperand &) const {
+MVT X86TargetLowering::getSetCCResultType(const SDOperand &) const {
   return MVT::i8;
 }
 
@@ -768,6 +793,23 @@
   return Align;
 }
 
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
+/// determining it.
+MVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
+                                       bool isSrcConst, bool isSrcStr) const {
+  if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
+    return MVT::v4i32;
+  if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
+    return MVT::v4f32;
+  if (Subtarget->is64Bit() && Size >= 8)
+    return MVT::i64;
+  return MVT::i32;
+}
+
+
 /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
 /// jumptable.
 SDOperand X86TargetLowering::getPICJumpTableRelocBase(SDOperand Table,
@@ -785,19 +827,6 @@
 
 #include "X86GenCallingConv.inc"
 
-/// GetPossiblePreceedingTailCall - Get preceeding X86ISD::TAILCALL node if it
-/// exists skip possible ISD:TokenFactor.
-static SDOperand GetPossiblePreceedingTailCall(SDOperand Chain) {
-  if (Chain.getOpcode() == X86ISD::TAILCALL) {
-    return Chain;
-  } else if (Chain.getOpcode() == ISD::TokenFactor) {
-    if (Chain.getNumOperands() &&
-        Chain.getOperand(0).getOpcode() == X86ISD::TAILCALL)
-      return Chain.getOperand(0);
-  }
-  return Chain;
-}
-
 /// LowerRET - Lower an ISD::RET node.
 SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
   assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
@@ -818,7 +847,7 @@
   SDOperand Chain = Op.getOperand(0);
   
   // Handle tail call return.
-  Chain = GetPossiblePreceedingTailCall(Chain);
+  Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL);
   if (Chain.getOpcode() == X86ISD::TAILCALL) {
     SDOperand TailCall = Chain;
     SDOperand TargetAddress = TailCall.getOperand(1);
@@ -871,10 +900,29 @@
       // Don't emit a copytoreg.
       continue;
     }
-    
+
     Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), ValToCopy, Flag);
     Flag = Chain.getValue(1);
   }
+
+  // The x86-64 ABI for returning structs by value requires that we copy
+  // the sret argument into %rax for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into %rax.
+  if (Subtarget->is64Bit() &&
+      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned Reg = FuncInfo->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+      FuncInfo->setSRetReturnReg(Reg);
+    }
+    SDOperand Val = DAG.getCopyFromReg(Chain, Reg, getPointerTy());
+
+    Chain = DAG.getCopyToReg(Chain, X86::RAX, Val, Flag);
+    Flag = Chain.getValue(1);
+  }
   
   RetOps[0] = Chain;  // Update chain.
 
@@ -905,7 +953,7 @@
   
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
-    MVT::ValueType CopyVT = RVLocs[i].getValVT();
+    MVT CopyVT = RVLocs[i].getValVT();
     
     // If this is a call to a function that returns an fp value on the floating
     // point stack, but where we prefer to use the value in xmm registers, copy
@@ -930,11 +978,11 @@
     
     ResultVals.push_back(Val);
   }
-  
+
   // Merge everything together with a MERGE_VALUES node.
   ResultVals.push_back(Chain);
-  return DAG.getNode(ISD::MERGE_VALUES, TheCall->getVTList(),
-                     &ResultVals[0], ResultVals.size()).Val;
+  return DAG.getMergeValues(TheCall->getVTList(), &ResultVals[0],
+                            ResultVals.size()).Val;
 }
 
 
@@ -1035,27 +1083,6 @@
   return None;
 }
 
-/// IsPossiblyOverwrittenArgumentOfTailCall - Check if the operand could
-/// possibly be overwritten when lowering the outgoing arguments in a tail
-/// call. Currently the implementation of this call is very conservative and
-/// assumes all arguments sourcing from FORMAL_ARGUMENTS or a CopyFromReg with
-/// virtual registers would be overwritten by direct lowering.
-static bool IsPossiblyOverwrittenArgumentOfTailCall(SDOperand Op, 
-                                                    MachineFrameInfo * MFI) {
-  RegisterSDNode * OpReg = NULL;
-  FrameIndexSDNode * FrameIdxNode = NULL;
-  int FrameIdx = 0;
-  if (Op.getOpcode() == ISD::FORMAL_ARGUMENTS ||
-      (Op.getOpcode()== ISD::CopyFromReg &&
-       (OpReg = dyn_cast<RegisterSDNode>(Op.getOperand(1))) &&
-       (OpReg->getReg() >= TargetRegisterInfo::FirstVirtualRegister)) ||
-      (Op.getOpcode() == ISD::LOAD &&
-       (FrameIdxNode = dyn_cast<FrameIndexSDNode>(Op.getOperand(1))) &&
-       (MFI->isFixedObjectIndex((FrameIdx = FrameIdxNode->getIndex()))) &&
-       (MFI->getObjectOffset(FrameIdx) >= 0)))
-    return true;
-  return false;
-}
 
 /// CallRequiresGOTInRegister - Check whether the call requires the GOT pointer
 /// in a register before calling.
@@ -1065,7 +1092,6 @@
     Subtarget->isPICStyleGOT();
 }
 
-
 /// CallRequiresFnAddressInReg - Check whether the call requires the function
 /// address to be loaded in a register.
 bool 
@@ -1075,33 +1101,6 @@
     Subtarget->isPICStyleGOT();
 }
 
-/// CopyTailCallClobberedArgumentsToVRegs - Create virtual registers for all
-/// arguments to force loading and guarantee that arguments sourcing from
-/// incomming parameters are not overwriting each other.
-static SDOperand 
-CopyTailCallClobberedArgumentsToVRegs(SDOperand Chain,
-     SmallVector<std::pair<unsigned, SDOperand>, 8> &TailCallClobberedVRegs,
-                                      SelectionDAG &DAG,
-                                      MachineFunction &MF,
-                                      const TargetLowering * TL) {
-      
-  SDOperand InFlag;
-  for (unsigned i = 0, e = TailCallClobberedVRegs.size(); i != e; i++) {
-    SDOperand Arg = TailCallClobberedVRegs[i].second;
-    unsigned Idx = TailCallClobberedVRegs[i].first;
-    unsigned VReg = 
-      MF.getRegInfo().
-      createVirtualRegister(TL->getRegClassFor(Arg.getValueType()));
-    Chain = DAG.getCopyToReg(Chain, VReg, Arg, InFlag);
-    InFlag = Chain.getValue(1);
-    Arg = DAG.getCopyFromReg(Chain, VReg, Arg.getValueType(), InFlag);
-    TailCallClobberedVRegs[i] = std::make_pair(Idx, Arg);
-    Chain = Arg.getValue(1);
-    InFlag = Arg.getValue(2);
-  }
-  return Chain;
-} 
-
 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
 /// by "Src" to address "Dst" with size and alignment information specified by
 /// the specific parameter attribute. The copy will be passed as a byval
@@ -1111,8 +1110,7 @@
                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG) {
   SDOperand SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   return DAG.getMemcpy(Chain, Dst, Src, SizeNode, Flags.getByValAlign(),
-                       /*AlwaysInline=*/true,
-                       NULL, 0, NULL, 0);
+                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
 }
 
 SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
@@ -1130,7 +1128,7 @@
   // changed with more analysis.  
   // In case of tail call optimization mark all arguments mutable. Since they
   // could be overwritten by lowering of arguments in case of a tail call.
-  int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
                                   VA.getLocMemOffset(), isImmutable);
   SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
   if (Flags.isByVal())
@@ -1158,6 +1156,7 @@
   bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
   unsigned CC = MF.getFunction()->getCallingConv();
   bool Is64Bit = Subtarget->is64Bit();
+  bool IsWin64 = Subtarget->isTargetWin64();
 
   assert(!(isVarArg && CC == CallingConv::Fast) &&
          "Var args not supported with calling convention fastcc");
@@ -1178,7 +1177,7 @@
     LastVal = VA.getValNo();
     
     if (VA.isRegLoc()) {
-      MVT::ValueType RegVT = VA.getLocVT();
+      MVT RegVT = VA.getLocVT();
       TargetRegisterClass *RC;
       if (RegVT == MVT::i32)
         RC = X86::GR32RegisterClass;
@@ -1188,13 +1187,25 @@
         RC = X86::FR32RegisterClass;
       else if (RegVT == MVT::f64)
         RC = X86::FR64RegisterClass;
-      else {
-        assert(MVT::isVector(RegVT));
-        if (Is64Bit && MVT::getSizeInBits(RegVT) == 64) {
-          RC = X86::GR64RegisterClass;       // MMX values are passed in GPRs.
-          RegVT = MVT::i64;
-        } else
-          RC = X86::VR128RegisterClass;
+      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
+        RC = X86::VR128RegisterClass;
+      else if (RegVT.isVector()) {
+        assert(RegVT.getSizeInBits() == 64);
+        if (!Is64Bit)
+          RC = X86::VR64RegisterClass;     // MMX values are passed in MMXs.
+        else {
+          // Darwin calling convention passes MMX values in either GPRs or
+          // XMMs in x86-64. Other targets pass them in memory.
+          if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) {
+            RC = X86::VR128RegisterClass;  // MMX values are passed in XMMs.
+            RegVT = MVT::v2i64;
+          } else {
+            RC = X86::GR64RegisterClass;   // v1i64 values are passed in GPRs.
+            RegVT = MVT::i64;
+          }
+        }
+      } else {
+        assert(0 && "Unknown argument type!");
       }
 
       unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
@@ -1214,9 +1225,15 @@
         ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
       
       // Handle MMX values passed in GPRs.
-      if (Is64Bit && RegVT != VA.getLocVT() && RC == X86::GR64RegisterClass &&
-          MVT::getSizeInBits(RegVT) == 64)
-        ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
+      if (Is64Bit && RegVT != VA.getLocVT()) {
+        if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass)
+          ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
+        else if (RC == X86::VR128RegisterClass) {
+          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i64, ArgValue,
+                                 DAG.getConstant(0, MVT::i64));
+          ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
+        }
+      }
       
       ArgValues.push_back(ArgValue);
     } else {
@@ -1225,6 +1242,21 @@
     }
   }
 
+  // The x86-64 ABI for returning structs by value requires that we copy
+  // the sret argument into %rax for the return. Save the argument into
+  // a virtual register so that we can access it from the return points.
+  if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned Reg = FuncInfo->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+      FuncInfo->setSRetReturnReg(Reg);
+    }
+    SDOperand Copy = DAG.getCopyToReg(DAG.getEntryNode(), Reg, ArgValues[0]);
+    Root = DAG.getNode(ISD::TokenFactor, MVT::Other, Copy, Root);
+  }
+
   unsigned StackSize = CCInfo.getNextStackOffset();
   // align stack specially for tail calls
   if (CC == CallingConv::Fast)
@@ -1237,30 +1269,52 @@
       VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
     }
     if (Is64Bit) {
-      static const unsigned GPR64ArgRegs[] = {
-        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8,  X86::R9
+      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
+
+      // FIXME: We should really autogenerate these arrays
+      static const unsigned GPR64ArgRegsWin64[] = {
+        X86::RCX, X86::RDX, X86::R8,  X86::R9
+      };
+      static const unsigned XMMArgRegsWin64[] = {
+        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
       };
-      static const unsigned XMMArgRegs[] = {
+      static const unsigned GPR64ArgRegs64Bit[] = {
+        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+      };
+      static const unsigned XMMArgRegs64Bit[] = {
         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
       };
-      
-      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
-      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
-    
+      const unsigned *GPR64ArgRegs, *XMMArgRegs;
+
+      if (IsWin64) {
+        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
+        GPR64ArgRegs = GPR64ArgRegsWin64;
+        XMMArgRegs = XMMArgRegsWin64;
+      } else {
+        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
+        GPR64ArgRegs = GPR64ArgRegs64Bit;
+        XMMArgRegs = XMMArgRegs64Bit;
+      }
+      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
+                                                       TotalNumIntRegs);
+      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
+                                                       TotalNumXMMRegs);
+
       // For X86-64, if there are vararg parameters that are passed via
       // registers, then we must store them to their spots on the stack so they
       // may be loaded by deferencing the result of va_next.
       VarArgsGPOffset = NumIntRegs * 8;
-      VarArgsFPOffset = 6 * 8 + NumXMMRegs * 16;
-      RegSaveFrameIndex = MFI->CreateStackObject(6 * 8 + 8 * 16, 16);
-      
+      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
+      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
+                                                 TotalNumXMMRegs * 16, 16);
+
       // Store the integer parameter registers.
       SmallVector<SDOperand, 8> MemOps;
       SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
       SDOperand FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
                                   DAG.getIntPtrConstant(VarArgsGPOffset));
-      for (; NumIntRegs != 6; ++NumIntRegs) {
+      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
         unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
                                   X86::GR64RegisterClass);
         SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
@@ -1272,11 +1326,11 @@
         FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
                           DAG.getIntPtrConstant(8));
       }
-      
+
       // Now store the XMM (fp + vector) parameter registers.
       FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
                         DAG.getIntPtrConstant(VarArgsFPOffset));
-      for (; NumXMMRegs != 8; ++NumXMMRegs) {
+      for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
         unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
                                   X86::VR128RegisterClass);
         SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
@@ -1325,8 +1379,8 @@
   FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
 
   // Return the new list of results.
-  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
-                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+  return DAG.getMergeValues(Op.Val->getVTList(), &ArgValues[0],
+                            ArgValues.size()).getValue(Op.ResNo);
 }
 
 SDOperand
@@ -1359,7 +1413,7 @@
   if (!IsTailCall || FPDiff==0) return Chain;
 
   // Adjust the Return address stack slot.
-  MVT::ValueType VT = getPointerTy();
+  MVT VT = getPointerTy();
   OutRetAddr = getReturnAddressFrameIndex(DAG);
   // Load the "old" Return address.
   OutRetAddr = DAG.getLoad(VT, Chain,OutRetAddr, NULL, 0);
@@ -1378,72 +1432,15 @@
   int SlotSize = Is64Bit ? 8 : 4;
   int NewReturnAddrFI = 
     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
-  MVT::ValueType VT = Is64Bit ? MVT::i64 : MVT::i32;
+  MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
   SDOperand NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
   Chain = DAG.getStore(Chain, RetAddrFrIdx, NewRetAddrFrIdx, 
                        PseudoSourceValue::getFixedStack(), NewReturnAddrFI);
   return Chain;
 }
 
-/// CopyTailCallByValClobberedRegToVirtReg - Copy arguments with register target
-/// which might be overwritten by later byval tail call lowering to a virtual
-/// register.
-bool
-X86TargetLowering::CopyTailCallByValClobberedRegToVirtReg(bool containsByValArg,
-    SmallVector< std::pair<unsigned, unsigned>, 8> &TailCallByValClobberedVRegs,
-    SmallVector<MVT::ValueType, 8> &TailCallByValClobberedVRegTypes,
-    std::pair<unsigned, SDOperand> &RegToPass,
-    SDOperand &OutChain,
-    SDOperand &OutFlag,
-    MachineFunction &MF,
-    SelectionDAG & DAG) {
-  if (!containsByValArg) return false;
-
-  std::pair<unsigned, unsigned> ArgRegVReg;
-  MVT::ValueType VT = RegToPass.second.getValueType();
-  
-  ArgRegVReg.first = RegToPass.first;
-  ArgRegVReg.second = MF.getRegInfo().createVirtualRegister(getRegClassFor(VT));
-  
-  // Copy Argument to virtual register.
-  OutChain = DAG.getCopyToReg(OutChain, ArgRegVReg.second,
-                              RegToPass.second, OutFlag);
-  OutFlag = OutChain.getValue(1);
-  // Remember virtual register and type.
-  TailCallByValClobberedVRegs.push_back(ArgRegVReg);
-  TailCallByValClobberedVRegTypes.push_back(VT);
-  return true;
-}
-
-
-/// RestoreTailCallByValClobberedReg - Restore registers which were saved to
-/// virtual registers to prevent tail call byval lowering from overwriting
-/// parameter registers.
-static SDOperand 
-RestoreTailCallByValClobberedRegs(SelectionDAG & DAG, SDOperand Chain,
-    SmallVector< std::pair<unsigned, unsigned>, 8> &TailCallByValClobberedVRegs,
-    SmallVector<MVT::ValueType, 8> &TailCallByValClobberedVRegTypes) {
-  if (TailCallByValClobberedVRegs.size()==0) return Chain;
-  
-  SmallVector<SDOperand, 8> RegOpChains;
-  for (unsigned i = 0, e=TailCallByValClobberedVRegs.size(); i != e; i++) {
-    SDOperand InFlag;
-    unsigned DestReg = TailCallByValClobberedVRegs[i].first;
-    unsigned VirtReg = TailCallByValClobberedVRegs[i].second;
-    MVT::ValueType VT = TailCallByValClobberedVRegTypes[i];
-    SDOperand Tmp = DAG.getCopyFromReg(Chain, VirtReg, VT, InFlag);
-    Chain = DAG.getCopyToReg(Chain, DestReg, Tmp, InFlag);
-    RegOpChains.push_back(Chain);
-  }
-  if (!RegOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
-                        &RegOpChains[0], RegOpChains.size());
-  return Chain;
-}
-
 SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
   MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo * MFI = MF.getFrameInfo();
   SDOperand Chain     = Op.getOperand(0);
   unsigned CC         = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
   bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
@@ -1494,18 +1491,11 @@
                                   FPDiff);
 
   SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
-  SmallVector<std::pair<unsigned, SDOperand>, 8> TailCallClobberedVRegs;
-  
   SmallVector<SDOperand, 8> MemOpChains;
-
   SDOperand StackPtr;
-  bool containsTailCallByValArg = false;
-  SmallVector<std::pair<unsigned, unsigned>, 8> TailCallByValClobberedVRegs;
-  SmallVector<MVT::ValueType, 8> TailCallByValClobberedVRegTypes;
-  
 
-  // Walk the register/memloc assignments, inserting copies/loads.  For tail
-  // calls, remember all arguments for later special lowering.
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization arguments are handle later.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
     SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
@@ -1528,6 +1518,30 @@
     }
     
     if (VA.isRegLoc()) {
+      if (Is64Bit) {
+        MVT RegVT = VA.getLocVT();
+        if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
+          switch (VA.getLocReg()) {
+          default:
+            break;
+          case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
+          case X86::R8: {
+            // Special case: passing MMX values in GPR registers.
+            Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Arg);
+            break;
+          }
+          case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
+          case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
+            // Special case: passing MMX values in XMM registers.
+            Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Arg);
+            Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Arg);
+            Arg = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64,
+                              DAG.getNode(ISD::UNDEF, MVT::v2i64), Arg,
+                              getMOVLMask(2, DAG));
+            break;
+          }
+          }
+      }
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
     } else {
       if (!IsTailCall || (IsTailCall && isByVal)) {
@@ -1537,10 +1551,6 @@
         
         MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
                                                Arg));
-        // Remember fact that this call contains byval arguments.
-        containsTailCallByValArg |= IsTailCall && isByVal;
-      } else if (IsPossiblyOverwrittenArgumentOfTailCall(Arg, MFI)) {
-        TailCallClobberedVRegs.push_back(std::make_pair(i,Arg));
       }
     }
   }
@@ -1552,21 +1562,14 @@
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into registers.
   SDOperand InFlag;
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    // Tail call byval lowering might overwrite argument registers so arguments
-    // passed to be copied to a virtual register for
-    // later processing.
-    if (CopyTailCallByValClobberedRegToVirtReg(containsTailCallByValArg,
-                                               TailCallByValClobberedVRegs,
-                                               TailCallByValClobberedVRegTypes,
-                                               RegsToPass[i], Chain, InFlag, MF,
-                                               DAG))
-      continue;
-     
-    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
-                             InFlag);
-    InFlag = Chain.getValue(1);
-  }
+  // Tail call byval lowering might overwrite argument registers so in case of
+  // tail call optimization the copies to registers are lowered later.
+  if (!IsTailCall)
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                               InFlag);
+      InFlag = Chain.getValue(1);
+    }
 
   // ELF / PIC requires GOT in the EBX register before function calls via PLT
   // GOT pointer.  
@@ -1600,7 +1603,8 @@
     // of SSE registers used. The contents of %al do not need to match exactly
     // the number of registers, but must be an ubound on the number of SSE
     // registers used and is in the range 0 - 8 inclusive.
-    
+
+    // FIXME: Verify this on Win64
     // Count the number of XMM registers allocated.
     static const unsigned XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
@@ -1621,10 +1625,6 @@
     int FI = 0;
     // Do not flag preceeding copytoreg stuff together with the following stuff.
     InFlag = SDOperand();
-    
-    Chain = CopyTailCallClobberedArgumentsToVRegs(Chain, TailCallClobberedVRegs,
-                                                  DAG, MF, this);
- 
     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
       CCValAssign &VA = ArgLocs[i];
       if (!VA.isRegLoc()) {
@@ -1635,21 +1635,10 @@
           cast<ARG_FLAGSSDNode>(FlagsOp)->getArgFlags();
         // Create frame index.
         int32_t Offset = VA.getLocMemOffset()+FPDiff;
-        uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
+        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
         FIN = DAG.getFrameIndex(FI, getPointerTy());
 
-        // Find virtual register for this argument.
-        bool Found=false;
-        for (unsigned idx=0, e= TailCallClobberedVRegs.size(); idx < e; idx++)
-          if (TailCallClobberedVRegs[idx].first==i) {
-            Arg = TailCallClobberedVRegs[idx].second;
-            Found=true;
-            break;
-          }
-        assert(IsPossiblyOverwrittenArgumentOfTailCall(Arg, MFI)==false || 
-          (Found==true && "No corresponding Argument was found"));
-
         if (Flags.isByVal()) {
           // Copy relative to framepointer.
           SDOperand Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
@@ -1672,10 +1661,13 @@
       Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                           &MemOpChains2[0], MemOpChains2.size());
 
-    // Restore byval lowering clobbered registers.
-    Chain = RestoreTailCallByValClobberedRegs(DAG, Chain,
-                                             TailCallByValClobberedVRegs,
-                                             TailCallByValClobberedVRegTypes);
+    // Copy arguments to their registers.
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                               InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag =SDOperand();
 
     // Store the return address to the appropriate stack slot.
     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
@@ -1853,15 +1845,7 @@
   if (!PerformTailCallOpt)
     return false;
 
-  // Check whether CALL node immediatly preceeds the RET node and whether the
-  // return uses the result of the node or is a void return.
-  unsigned NumOps = Ret.getNumOperands();
-  if ((NumOps == 1 && 
-       (Ret.getOperand(0) == SDOperand(Call.Val,1) ||
-        Ret.getOperand(0) == SDOperand(Call.Val,0))) ||
-      (NumOps > 1 &&
-       Ret.getOperand(0) == SDOperand(Call.Val,Call.Val->getNumValues()-1) &&
-       Ret.getOperand(1) == SDOperand(Call.Val,0))) {
+  if (CheckTailCallReturnConstraints(Call, Ret)) {
     MachineFunction &MF = DAG.getMachineFunction();
     unsigned CallerCC = MF.getFunction()->getCallingConv();
     unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
@@ -2588,9 +2572,9 @@
 static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1,
                                       SDOperand &V2, SDOperand &Mask,
                                       SelectionDAG &DAG) {
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType MaskVT = Mask.getValueType();
-  MVT::ValueType EltVT = MVT::getVectorElementType(MaskVT);
+  MVT VT = Op.getValueType();
+  MVT MaskVT = Mask.getValueType();
+  MVT EltVT = MaskVT.getVectorElementType();
   unsigned NumElems = Mask.getNumOperands();
   SmallVector<SDOperand, 8> MaskVec;
 
@@ -2617,8 +2601,8 @@
 /// the two vector operands have swapped position.
 static
 SDOperand CommuteVectorShuffleMask(SDOperand Mask, SelectionDAG &DAG) {
-  MVT::ValueType MaskVT = Mask.getValueType();
-  MVT::ValueType EltVT = MVT::getVectorElementType(MaskVT);
+  MVT MaskVT = Mask.getValueType();
+  MVT EltVT = MaskVT.getVectorElementType();
   unsigned NumElems = Mask.getNumOperands();
   SmallVector<SDOperand, 8> MaskVec;
   for (unsigned i = 0; i != NumElems; ++i) {
@@ -2656,11 +2640,16 @@
 }
 
 /// isScalarLoadToVector - Returns true if the node is a scalar load that
-/// is promoted to a vector.
-static inline bool isScalarLoadToVector(SDNode *N) {
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
   if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
     N = N->getOperand(0).Val;
-    return ISD::isNON_EXTLoad(N);
+    if (ISD::isNON_EXTLoad(N)) {
+      if (LD)
+        *LD = cast<LoadSDNode>(N);
+      return true;
+    }
   }
   return false;
 }
@@ -2772,30 +2761,35 @@
 
 /// getZeroVector - Returns a vector of specified type with all zero elements.
 ///
-static SDOperand getZeroVector(MVT::ValueType VT, SelectionDAG &DAG) {
-  assert(MVT::isVector(VT) && "Expected a vector type");
+static SDOperand getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG) {
+  assert(VT.isVector() && "Expected a vector type");
   
   // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
   // type.  This ensures they get CSE'd.
-  SDOperand Cst = DAG.getTargetConstant(0, MVT::i32);
   SDOperand Vec;
-  if (MVT::getSizeInBits(VT) == 64)  // MMX
+  if (VT.getSizeInBits() == 64) { // MMX
+    SDOperand Cst = DAG.getTargetConstant(0, MVT::i32);
     Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
-  else                                              // SSE
+  } else if (HasSSE2) {  // SSE2
+    SDOperand Cst = DAG.getTargetConstant(0, MVT::i32);
     Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
+  } else { // SSE1
+    SDOperand Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4f32, Cst, Cst, Cst, Cst);
+  }
   return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
 }
 
 /// getOnesVector - Returns a vector of specified type with all bits set.
 ///
-static SDOperand getOnesVector(MVT::ValueType VT, SelectionDAG &DAG) {
-  assert(MVT::isVector(VT) && "Expected a vector type");
+static SDOperand getOnesVector(MVT VT, SelectionDAG &DAG) {
+  assert(VT.isVector() && "Expected a vector type");
   
   // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
   // type.  This ensures they get CSE'd.
   SDOperand Cst = DAG.getTargetConstant(~0U, MVT::i32);
   SDOperand Vec;
-  if (MVT::getSizeInBits(VT) == 64)  // MMX
+  if (VT.getSizeInBits() == 64)  // MMX
     Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
   else                                              // SSE
     Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
@@ -2832,8 +2826,8 @@
 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
 /// operation of specified width.
 static SDOperand getMOVLMask(unsigned NumElems, SelectionDAG &DAG) {
-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
-  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT BaseVT = MaskVT.getVectorElementType();
 
   SmallVector<SDOperand, 8> MaskVec;
   MaskVec.push_back(DAG.getConstant(NumElems, BaseVT));
@@ -2845,8 +2839,8 @@
 /// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation
 /// of specified width.
 static SDOperand getUnpacklMask(unsigned NumElems, SelectionDAG &DAG) {
-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
-  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT BaseVT = MaskVT.getVectorElementType();
   SmallVector<SDOperand, 8> MaskVec;
   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
     MaskVec.push_back(DAG.getConstant(i,            BaseVT));
@@ -2858,8 +2852,8 @@
 /// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
 /// of specified width.
 static SDOperand getUnpackhMask(unsigned NumElems, SelectionDAG &DAG) {
-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
-  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT BaseVT = MaskVT.getVectorElementType();
   unsigned Half = NumElems/2;
   SmallVector<SDOperand, 8> MaskVec;
   for (unsigned i = 0; i != Half; ++i) {
@@ -2874,8 +2868,8 @@
 /// elements in place.
 static SDOperand getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
                                    SelectionDAG &DAG) {
-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
-  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT BaseVT = MaskVT.getVectorElementType();
   SmallVector<SDOperand, 8> MaskVec;
   // Element #0 of the result gets the elt we are replacing.
   MaskVec.push_back(DAG.getConstant(DestElt, BaseVT));
@@ -2886,8 +2880,8 @@
 
 /// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
 static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG, bool HasSSE2) {
-  MVT::ValueType PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32;
-  MVT::ValueType VT = Op.getValueType();
+  MVT PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32;
+  MVT VT = Op.getValueType();
   if (PVT == VT)
     return Op;
   SDOperand V1 = Op.getOperand(0);
@@ -2900,7 +2894,7 @@
       V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask);
       NumElems >>= 1;
     }
-    Mask = getZeroVector(MVT::v4i32, DAG);
+    Mask = getZeroVector(MVT::v4i32, true, DAG);
   }
 
   V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1);
@@ -2914,12 +2908,14 @@
 /// element of V2 is swizzled into the zero/undef vector, landing at element
 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
 static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, unsigned Idx,
-                                             bool isZero, SelectionDAG &DAG) {
-  MVT::ValueType VT = V2.getValueType();
-  SDOperand V1 = isZero ? getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT);
-  unsigned NumElems = MVT::getVectorNumElements(V2.getValueType());
-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
-  MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
+                                             bool isZero, bool HasSSE2,
+                                             SelectionDAG &DAG) {
+  MVT VT = V2.getValueType();
+  SDOperand V1 = isZero
+    ? getZeroVector(VT, HasSSE2, DAG) : DAG.getNode(ISD::UNDEF, VT);
+  unsigned NumElems = V2.getValueType().getVectorNumElements();
+  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT EVT = MaskVT.getVectorElementType();
   SmallVector<SDOperand, 16> MaskVec;
   for (unsigned i = 0; i != NumElems; ++i)
     if (i == Idx)  // If this is the insertion idx, put the low elt of V2 here.
@@ -2931,6 +2927,70 @@
   return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
 }
 
+/// getNumOfConsecutiveZeros - Return the number of elements in a result of
+/// a shuffle that is zero.
+static
+unsigned getNumOfConsecutiveZeros(SDOperand Op, SDOperand Mask,
+                                  unsigned NumElems, bool Low,
+                                  SelectionDAG &DAG) {
+  unsigned NumZeros = 0;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    unsigned Index = Low ? i : NumElems-i-1;
+    SDOperand Idx = Mask.getOperand(Index);
+    if (Idx.getOpcode() == ISD::UNDEF) {
+      ++NumZeros;
+      continue;
+    }
+    SDOperand Elt = DAG.getShuffleScalarElt(Op.Val, Index);
+    if (Elt.Val && isZeroNode(Elt))
+      ++NumZeros;
+    else
+      break;
+  }
+  return NumZeros;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+static bool isVectorShift(SDOperand Op, SDOperand Mask, SelectionDAG &DAG,
+                          bool &isLeft, SDOperand &ShVal, unsigned &ShAmt) {
+  unsigned NumElems = Mask.getNumOperands();
+
+  isLeft = true;
+  unsigned NumZeros= getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
+  if (!NumZeros) {
+    isLeft = false;
+    NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
+    if (!NumZeros)
+      return false;
+  }
+
+  bool SeenV1 = false;
+  bool SeenV2 = false;
+  for (unsigned i = NumZeros; i < NumElems; ++i) {
+    unsigned Val = isLeft ? (i - NumZeros) : i;
+    SDOperand Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));
+    if (Idx.getOpcode() == ISD::UNDEF)
+      continue;
+    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
+    if (Index < NumElems)
+      SeenV1 = true;
+    else {
+      Index -= NumElems;
+      SeenV2 = true;
+    }
+    if (Index != Val)
+      return false;
+  }
+  if (SeenV1 && SeenV2)
+    return false;
+
+  ShVal = SeenV1 ? Op.getOperand(0) : Op.getOperand(1);
+  ShAmt = NumZeros;
+  return true;
+}
+
+
 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
 ///
 static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros,
@@ -2945,7 +3005,7 @@
     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
     if (ThisIsNonZero && First) {
       if (NumZero)
-        V = getZeroVector(MVT::v8i16, DAG);
+        V = getZeroVector(MVT::v8i16, true, DAG);
       else
         V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
       First = false;
@@ -2990,7 +3050,7 @@
     if (isNonZero) {
       if (First) {
         if (NumZero)
-          V = getZeroVector(MVT::v8i16, DAG);
+          V = getZeroVector(MVT::v8i16, true, DAG);
         else
           V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
         First = false;
@@ -3003,6 +3063,20 @@
   return V;
 }
 
+/// getVShift - Return a vector logical shift node.
+///
+static SDOperand getVShift(bool isLeft, MVT VT, SDOperand SrcOp,
+                           unsigned NumBits, SelectionDAG &DAG,
+                           const TargetLowering &TLI) {
+  bool isMMX = VT.getSizeInBits() == 64;
+  MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
+  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+  SrcOp = DAG.getNode(ISD::BIT_CONVERT, ShVT, SrcOp);
+  return DAG.getNode(ISD::BIT_CONVERT, VT,
+                     DAG.getNode(Opc, ShVT, SrcOp,
+                              DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
+}
+
 SDOperand
 X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
   // All zero's are handled with pxor, all one's are handled with pcmpeqd.
@@ -3015,12 +3089,12 @@
 
     if (ISD::isBuildVectorAllOnes(Op.Val))
       return getOnesVector(Op.getValueType(), DAG);
-    return getZeroVector(Op.getValueType(), DAG);
+    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG);
   }
 
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType EVT = MVT::getVectorElementType(VT);
-  unsigned EVTBits = MVT::getSizeInBits(EVT);
+  MVT VT = Op.getValueType();
+  MVT EVT = VT.getVectorElementType();
+  unsigned EVTBits = EVT.getSizeInBits();
 
   unsigned NumElems = Op.getNumOperands();
   unsigned NumZero  = 0;
@@ -3063,14 +3137,15 @@
         (!IsAllConstants || Idx == 0)) {
       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
         // Handle MMX and SSE both.
-        MVT::ValueType VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
-        MVT::ValueType VecElts = VT == MVT::v2i64 ? 4 : 2;
+        MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
+        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
         
         // Truncate the value (which may itself be a constant) to i32, and
         // convert it to a vector with movd (S2V+shuffle to zero extend).
         Item = DAG.getNode(ISD::TRUNCATE, MVT::i32, Item);
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VecVT, Item);
-        Item = getShuffleVectorZeroOrUndef(Item, 0, true, DAG);
+        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
+                                           Subtarget->hasSSE2(), DAG);
         
         // Now we have our 32-bit value zero extended in the low element of
         // a vector.  If Idx != 0, swizzle it into place.
@@ -3095,7 +3170,17 @@
         (EVT != MVT::i64 || Subtarget->is64Bit())) {
       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
       // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
-      return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, DAG);
+      return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
+                                         Subtarget->hasSSE2(), DAG);
+    }
+
+    // Is it a vector logical left shift?
+    if (NumElems == 2 && Idx == 1 &&
+        isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
+      unsigned NumBits = VT.getSizeInBits();
+      return getVShift(true, VT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(1)),
+                       NumBits/2, DAG, *this);
     }
     
     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
@@ -3110,9 +3195,10 @@
       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
       
       // Turn it into a shuffle of zero and zero-extended scalar to vector.
-      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, DAG);
-      MVT::ValueType MaskVT  = MVT::getIntVectorWithNumElements(NumElems);
-      MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
+      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
+                                         Subtarget->hasSSE2(), DAG);
+      MVT MaskVT  = MVT::getIntVectorWithNumElements(NumElems);
+      MVT MaskEVT = MaskVT.getVectorElementType();
       SmallVector<SDOperand, 8> MaskVec;
       for (unsigned i = 0; i < NumElems; i++)
         MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
@@ -3133,8 +3219,17 @@
     return SDOperand();
 
   // Let legalizer expand 2-wide build_vectors.
-  if (EVTBits == 64)
+  if (EVTBits == 64) {
+    if (NumNonZero == 1) {
+      // One half is zero or undef.
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDOperand V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT,
+                                 Op.getOperand(Idx));
+      return getShuffleVectorZeroOrUndef(V2, Idx, true,
+                                         Subtarget->hasSSE2(), DAG);
+    }
     return SDOperand();
+  }
 
   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   if (EVTBits == 8 && NumElems == 16) {
@@ -3156,7 +3251,7 @@
     for (unsigned i = 0; i < 4; ++i) {
       bool isZero = !(NonZeros & (1 << i));
       if (isZero)
-        V[i] = getZeroVector(VT, DAG);
+        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG);
       else
         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
     }
@@ -3182,15 +3277,8 @@
       }
     }
 
-    // Take advantage of the fact GR32 to VR128 scalar_to_vector (i.e. movd)
-    // clears the upper bits.
-    // FIXME: we can do the same for v4f32 case when we know both parts of
-    // the lower half come from scalar_to_vector (loadf32). We should do
-    // that in post legalizer dag combiner with target specific hooks.
-    if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
-      return V[0];
-    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
-    MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
+    MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+    MVT EVT = MaskVT.getVectorElementType();
     SmallVector<SDOperand, 8> MaskVec;
     bool Reverse = (NonZeros & 0x3) == 2;
     for (unsigned i = 0; i < 2; ++i)
@@ -3236,9 +3324,9 @@
                                    SDOperand PermMask, SelectionDAG &DAG,
                                    TargetLowering &TLI) {
   SDOperand NewV;
-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(8);
-  MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
-  MVT::ValueType PtrVT = TLI.getPointerTy();
+  MVT MaskVT = MVT::getIntVectorWithNumElements(8);
+  MVT MaskEVT = MaskVT.getVectorElementType();
+  MVT PtrVT = TLI.getPointerTy();
   SmallVector<SDOperand, 8> MaskElts(PermMask.Val->op_begin(),
                                      PermMask.Val->op_end());
 
@@ -3362,8 +3450,6 @@
         continue;
       SDOperand Elt = MaskElts[i];
       unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
-      if (EltIdx == i)
-        continue;
       SDOperand ExtOp = (EltIdx < 8)
         ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
                       DAG.getConstant(EltIdx, PtrVT))
@@ -3478,23 +3564,23 @@
 /// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
 static
 SDOperand RewriteAsNarrowerShuffle(SDOperand V1, SDOperand V2,
-                                MVT::ValueType VT,
+                                MVT VT,
                                 SDOperand PermMask, SelectionDAG &DAG,
                                 TargetLowering &TLI) {
   unsigned NumElems = PermMask.getNumOperands();
   unsigned NewWidth = (NumElems == 4) ? 2 : 4;
-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
-  MVT::ValueType NewVT = MaskVT;
-  switch (VT) {
+  MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
+  MVT NewVT = MaskVT;
+  switch (VT.getSimpleVT()) {
+  default: assert(false && "Unexpected!");
   case MVT::v4f32: NewVT = MVT::v2f64; break;
   case MVT::v4i32: NewVT = MVT::v2i64; break;
   case MVT::v8i16: NewVT = MVT::v4i32; break;
   case MVT::v16i8: NewVT = MVT::v4i32; break;
-  default: assert(false && "Unexpected!");
   }
 
   if (NewWidth == 2) {
-    if (MVT::isInteger(VT))
+    if (VT.isInteger())
       NewVT = MVT::v2i64;
     else
       NewVT = MVT::v2f64;
@@ -3526,14 +3612,46 @@
                                  &MaskVec[0], MaskVec.size()));
 }
 
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDOperand getVZextMovL(MVT VT, MVT OpVT,
+                              SDOperand SrcOp, SelectionDAG &DAG,
+                              const X86Subtarget *Subtarget) {
+  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    LoadSDNode *LD = NULL;
+    if (!isScalarLoadToVector(SrcOp.Val, &LD))
+      LD = dyn_cast<LoadSDNode>(SrcOp);
+    if (!LD) {
+      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+      // instead.
+      MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+      if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
+          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
+          SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
+        // PR2108
+        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+        return DAG.getNode(ISD::BIT_CONVERT, VT,
+                           DAG.getNode(X86ISD::VZEXT_MOVL, OpVT,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, OpVT,
+                                                   SrcOp.getOperand(0).getOperand(0))));
+      }
+    }
+  }
+
+  return DAG.getNode(ISD::BIT_CONVERT, VT,
+                     DAG.getNode(X86ISD::VZEXT_MOVL, OpVT,
+                                 DAG.getNode(ISD::BIT_CONVERT, OpVT, SrcOp)));
+}
+
 SDOperand
 X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
   SDOperand V1 = Op.getOperand(0);
   SDOperand V2 = Op.getOperand(1);
   SDOperand PermMask = Op.getOperand(2);
-  MVT::ValueType VT = Op.getValueType();
+  MVT VT = Op.getValueType();
   unsigned NumElems = PermMask.getNumOperands();
-  bool isMMX = MVT::getSizeInBits(VT) == 64;
+  bool isMMX = VT.getSizeInBits() == 64;
   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   bool V1IsSplat = false;
@@ -3543,7 +3661,7 @@
     return DAG.getNode(ISD::UNDEF, VT);
 
   if (isZeroShuffle(Op.Val))
-    return getZeroVector(VT, DAG);
+    return getZeroVector(VT, Subtarget->hasSSE2(), DAG);
 
   if (isIdentityMask(PermMask.Val))
     return V1;
@@ -3566,27 +3684,46 @@
     // FIXME: Figure out a cleaner way to do this.
     // Try to make use of movq to zero out the top part.
     if (ISD::isBuildVectorAllZeros(V2.Val)) {
-      SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
+      SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
+                                                 DAG, *this);
       if (NewOp.Val) {
         SDOperand NewV1 = NewOp.getOperand(0);
         SDOperand NewV2 = NewOp.getOperand(1);
         SDOperand NewMask = NewOp.getOperand(2);
         if (isCommutedMOVL(NewMask.Val, true, false)) {
           NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
-          NewOp = DAG.getNode(ISD::VECTOR_SHUFFLE, NewOp.getValueType(),
-                              NewV1, NewV2, getMOVLMask(2, DAG));
-          return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
+          return getVZextMovL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget);
         }
       }
     } else if (ISD::isBuildVectorAllZeros(V1.Val)) {
-      SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
+      SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
+                                                DAG, *this);
       if (NewOp.Val && X86::isMOVLMask(NewOp.getOperand(2).Val))
-        return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
+        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
+                             DAG, Subtarget);
     }
   }
 
-  if (X86::isMOVLMask(PermMask.Val))
-    return (V1IsUndef) ? V2 : Op;
+  // Check if this can be converted into a logical shift.
+  bool isLeft = false;
+  unsigned ShAmt = 0;
+  SDOperand ShVal;
+  bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt);
+  if (isShift && ShVal.hasOneUse()) {
+    // If the shifted value has multiple uses, it may be cheaper to use 
+    // v_set0 + movlhps or movhlps, etc.
+    MVT EVT = VT.getVectorElementType();
+    ShAmt *= EVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
+  }
+
+  if (X86::isMOVLMask(PermMask.Val)) {
+    if (V1IsUndef)
+      return V2;
+    if (ISD::isBuildVectorAllZeros(V1.Val))
+      return getVZextMovL(VT, VT, V2, DAG, Subtarget);
+    return Op;
+  }
 
   if (X86::isMOVSHDUPMask(PermMask.Val) ||
       X86::isMOVSLDUPMask(PermMask.Val) ||
@@ -3599,6 +3736,13 @@
       ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
     return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
 
+  if (isShift) {
+    // No better options. Use a vshl / vsrl.
+    MVT EVT = VT.getVectorElementType();
+    ShAmt *= EVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
+  }
+
   bool Commuted = false;
   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
   // 1,1,1,1 -> v8i16 though.
@@ -3679,7 +3823,7 @@
         (X86::isPSHUFDMask(PermMask.Val) ||
          X86::isPSHUFHWMask(PermMask.Val) ||
          X86::isPSHUFLWMask(PermMask.Val))) {
-      MVT::ValueType RVT = VT;
+      MVT RVT = VT;
       if (VT == MVT::v4f32) {
         RVT = MVT::v4i32;
         Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT,
@@ -3709,8 +3853,8 @@
   // Handle all 4 wide cases with a number of shuffles.
   if (NumElems == 4 && !isMMX) {
     // Don't do this for MMX.
-    MVT::ValueType MaskVT = PermMask.getValueType();
-    MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
+    MVT MaskVT = PermMask.getValueType();
+    MVT MaskEVT = MaskVT.getVectorElementType();
     SmallVector<std::pair<int, int>, 8> Locs;
     Locs.reserve(NumElems);
     SmallVector<SDOperand, 8> Mask1(NumElems,
@@ -3817,14 +3961,14 @@
 SDOperand
 X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op,
                                                 SelectionDAG &DAG) {
-  MVT::ValueType VT = Op.getValueType();
-  if (MVT::getSizeInBits(VT) == 8) {
+  MVT VT = Op.getValueType();
+  if (VT.getSizeInBits() == 8) {
     SDOperand Extract = DAG.getNode(X86ISD::PEXTRB, MVT::i32,
                                     Op.getOperand(0), Op.getOperand(1));
     SDOperand Assert  = DAG.getNode(ISD::AssertZext, MVT::i32, Extract,
                                     DAG.getValueType(VT));
     return DAG.getNode(ISD::TRUNCATE, VT, Assert);
-  } else if (MVT::getSizeInBits(VT) == 16) {
+  } else if (VT.getSizeInBits() == 16) {
     SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, MVT::i32,
                                     Op.getOperand(0), Op.getOperand(1));
     SDOperand Assert  = DAG.getNode(ISD::AssertZext, MVT::i32, Extract,
@@ -3861,9 +4005,9 @@
       return Res;
   }
 
-  MVT::ValueType VT = Op.getValueType();
+  MVT VT = Op.getValueType();
   // TODO: handle v16i8.
-  if (MVT::getSizeInBits(VT) == 16) {
+  if (VT.getSizeInBits() == 16) {
     SDOperand Vec = Op.getOperand(0);
     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
     if (Idx == 0)
@@ -3872,27 +4016,27 @@
                                  DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Vec),
                                      Op.getOperand(1)));
     // Transform it so it match pextrw which produces a 32-bit result.
-    MVT::ValueType EVT = (MVT::ValueType)(VT+1);
+    MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
     SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
                                     Op.getOperand(0), Op.getOperand(1));
     SDOperand Assert  = DAG.getNode(ISD::AssertZext, EVT, Extract,
                                     DAG.getValueType(VT));
     return DAG.getNode(ISD::TRUNCATE, VT, Assert);
-  } else if (MVT::getSizeInBits(VT) == 32) {
+  } else if (VT.getSizeInBits() == 32) {
     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
     if (Idx == 0)
       return Op;
     // SHUFPS the element to the lowest double word, then movss.
-    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+    MVT MaskVT = MVT::getIntVectorWithNumElements(4);
     SmallVector<SDOperand, 8> IdxVec;
     IdxVec.
-      push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
+      push_back(DAG.getConstant(Idx, MaskVT.getVectorElementType()));
     IdxVec.
-      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+      push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType()));
     IdxVec.
-      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+      push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType()));
     IdxVec.
-      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+      push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType()));
     SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                  &IdxVec[0], IdxVec.size());
     SDOperand Vec = Op.getOperand(0);
@@ -3900,7 +4044,7 @@
                       Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
                        DAG.getIntPtrConstant(0));
-  } else if (MVT::getSizeInBits(VT) == 64) {
+  } else if (VT.getSizeInBits() == 64) {
     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
     //        to match extract_elt for f64.
@@ -3911,11 +4055,11 @@
     // UNPCKHPD the element to the lowest double word, then movsd.
     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
-    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+    MVT MaskVT = MVT::getIntVectorWithNumElements(4);
     SmallVector<SDOperand, 8> IdxVec;
-    IdxVec.push_back(DAG.getConstant(1, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getConstant(1, MaskVT.getVectorElementType()));
     IdxVec.
-      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+      push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType()));
     SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                  &IdxVec[0], IdxVec.size());
     SDOperand Vec = Op.getOperand(0);
@@ -3930,15 +4074,15 @@
 
 SDOperand
 X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG){
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  MVT VT = Op.getValueType();
+  MVT EVT = VT.getVectorElementType();
 
   SDOperand N0 = Op.getOperand(0);
   SDOperand N1 = Op.getOperand(1);
   SDOperand N2 = Op.getOperand(2);
 
-  if ((MVT::getSizeInBits(EVT) == 8) || (MVT::getSizeInBits(EVT) == 16)) {
-    unsigned Opc = (MVT::getSizeInBits(EVT) == 8) ? X86ISD::PINSRB
+  if ((EVT.getSizeInBits() == 8) || (EVT.getSizeInBits() == 16)) {
+    unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
                                                   : X86ISD::PINSRW;
     // Transform it so it match pinsr{b,w} which expects a GR32 as its second
     // argument.
@@ -3964,8 +4108,8 @@
 
 SDOperand
 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  MVT VT = Op.getValueType();
+  MVT EVT = VT.getVectorElementType();
 
   if (Subtarget->hasSSE41())
     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
@@ -3977,7 +4121,7 @@
   SDOperand N1 = Op.getOperand(1);
   SDOperand N2 = Op.getOperand(2);
 
-  if (MVT::getSizeInBits(EVT) == 16) {
+  if (EVT.getSizeInBits() == 16) {
     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
     // as its second argument.
     if (N1.getValueType() != MVT::i32)
@@ -3992,8 +4136,8 @@
 SDOperand
 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
   SDOperand AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0));
-  MVT::ValueType VT = MVT::v2i32;
-  switch (Op.getValueType()) {
+  MVT VT = MVT::v2i32;
+  switch (Op.getValueType().getSimpleVT()) {
   default: break;
   case MVT::v16i8:
   case MVT::v8i16:
@@ -4032,9 +4176,6 @@
 X86TargetLowering::LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) {
   GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   SDOperand Result = DAG.getTargetGlobalAddress(GV, getPointerTy());
-  // If it's a debug information descriptor, don't mess with it.
-  if (DAG.isVerifiedDebugInfoDesc(Op))
-    return Result;
   Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
   // With PIC, the address is actually $g + Offset.
   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
@@ -4056,10 +4197,10 @@
   return Result;
 }
 
-// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
 static SDOperand
-LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                              const MVT::ValueType PtrVT) {
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const MVT PtrVT) {
   SDOperand InFlag;
   SDOperand Chain = DAG.getCopyToReg(DAG.getEntryNode(), X86::EBX,
                                      DAG.getNode(X86ISD::GlobalBaseReg,
@@ -4094,11 +4235,43 @@
   return DAG.getCopyFromReg(Chain, X86::EAX, PtrVT, InFlag);
 }
 
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDOperand
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const MVT PtrVT) {
+  SDOperand InFlag, Chain;
+
+  // emit leaq symbol at TLSGD(%rip), %rdi
+  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
+  SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                             GA->getValueType(0),
+                                             GA->getOffset());
+  SDOperand Ops[]  = { DAG.getEntryNode(), TGA};
+  SDOperand Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 2);
+  Chain  = Result.getValue(1);
+  InFlag = Result.getValue(2);
+
+  // call ___tls_get_addr. This function receives its argument in
+  // the register RDI.
+  Chain = DAG.getCopyToReg(Chain, X86::RDI, Result, InFlag);
+  InFlag = Chain.getValue(1);
+
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDOperand Ops1[] = { Chain,
+                      DAG.getTargetExternalSymbol("___tls_get_addr",
+                                                  PtrVT),
+                      DAG.getRegister(X86::RDI, PtrVT),
+                      InFlag };
+  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 4);
+  InFlag = Chain.getValue(1);
+
+  return DAG.getCopyFromReg(Chain, X86::RAX, PtrVT, InFlag);
+}
+
 // Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
 // "local exec" model.
-static SDOperand
-LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                         const MVT::ValueType PtrVT) {
+static SDOperand LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                     const MVT PtrVT) {
   // Get the Thread Pointer
   SDOperand ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, PtrVT);
   // emit "addl x at ntpoff,%eax" (local exec) or "addl x at indntpoff,%eax" (initial
@@ -4121,15 +4294,19 @@
 X86TargetLowering::LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) {
   // TODO: implement the "local dynamic" model
   // TODO: implement the "initial exec"model for pic executables
-  assert(!Subtarget->is64Bit() && Subtarget->isTargetELF() &&
-         "TLS not implemented for non-ELF and 64-bit targets");
+  assert(Subtarget->isTargetELF() &&
+         "TLS not implemented for non-ELF targets");
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   // If the relocation model is PIC, use the "General Dynamic" TLS Model,
   // otherwise use the "Local Exec"TLS Model
-  if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
-    return LowerToTLSGeneralDynamicModel(GA, DAG, getPointerTy());
-  else
-    return LowerToTLSExecModel(GA, DAG, getPointerTy());
+  if (Subtarget->is64Bit()) {
+    return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+  } else {
+    if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
+      return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+    else
+      return LowerToTLSExecModel(GA, DAG, getPointerTy());
+  }
 }
 
 SDOperand
@@ -4167,8 +4344,8 @@
 /// take a 2 x i32 value to shift plus a shift amount. 
 SDOperand X86TargetLowering::LowerShift(SDOperand Op, SelectionDAG &DAG) {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
-  MVT::ValueType VT = Op.getValueType();
-  unsigned VTBits = MVT::getSizeInBits(VT);
+  MVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   SDOperand ShOpLo = Op.getOperand(0);
   SDOperand ShOpHi = Op.getOperand(1);
@@ -4186,7 +4363,6 @@
     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, VT, ShOpHi, ShAmt);
   }
 
-  const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
   SDOperand AndNode = DAG.getNode(ISD::AND, MVT::i8, ShAmt,
                                   DAG.getConstant(VTBits, MVT::i8));
   SDOperand Cond = DAG.getNode(X86ISD::CMP, VT,
@@ -4194,46 +4370,24 @@
 
   SDOperand Hi, Lo;
   SDOperand CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-  VTs = DAG.getNodeValueTypes(VT, MVT::Flag);
-  SmallVector<SDOperand, 4> Ops;
-  if (Op.getOpcode() == ISD::SHL_PARTS) {
-    Ops.push_back(Tmp2);
-    Ops.push_back(Tmp3);
-    Ops.push_back(CC);
-    Ops.push_back(Cond);
-    Hi = DAG.getNode(X86ISD::CMOV, VT, &Ops[0], Ops.size());
+  SDOperand Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+  SDOperand Ops1[4] = { Tmp3, Tmp1, CC, Cond };
 
-    Ops.clear();
-    Ops.push_back(Tmp3);
-    Ops.push_back(Tmp1);
-    Ops.push_back(CC);
-    Ops.push_back(Cond);
-    Lo = DAG.getNode(X86ISD::CMOV, VT, &Ops[0], Ops.size());
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Hi = DAG.getNode(X86ISD::CMOV, VT, Ops0, 4);
+    Lo = DAG.getNode(X86ISD::CMOV, VT, Ops1, 4);
   } else {
-    Ops.push_back(Tmp2);
-    Ops.push_back(Tmp3);
-    Ops.push_back(CC);
-    Ops.push_back(Cond);
-    Lo = DAG.getNode(X86ISD::CMOV, VT, &Ops[0], Ops.size());
-
-    Ops.clear();
-    Ops.push_back(Tmp3);
-    Ops.push_back(Tmp1);
-    Ops.push_back(CC);
-    Ops.push_back(Cond);
-    Hi = DAG.getNode(X86ISD::CMOV, VT, &Ops[0], Ops.size());
+    Lo = DAG.getNode(X86ISD::CMOV, VT, Ops0, 4);
+    Hi = DAG.getNode(X86ISD::CMOV, VT, Ops1, 4);
   }
 
-  VTs = DAG.getNodeValueTypes(VT, VT);
-  Ops.clear();
-  Ops.push_back(Lo);
-  Ops.push_back(Hi);
-  return DAG.getNode(ISD::MERGE_VALUES, VTs, 2, &Ops[0], Ops.size());
+  SDOperand Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2);
 }
 
 SDOperand X86TargetLowering::LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
-  MVT::ValueType SrcVT = Op.getOperand(0).getValueType();
-  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
+  MVT SrcVT = Op.getOperand(0).getValueType();
+  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
          "Unknown SINT_TO_FP to lower!");
   
   // These are really Legal; caller falls through into that case.
@@ -4243,7 +4397,7 @@
       Subtarget->is64Bit())
     return SDOperand();
   
-  unsigned Size = MVT::getSizeInBits(SrcVT)/8;
+  unsigned Size = SrcVT.getSizeInBits()/8;
   MachineFunction &MF = DAG.getMachineFunction();
   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
   SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
@@ -4293,7 +4447,8 @@
 
 std::pair<SDOperand,SDOperand> X86TargetLowering::
 FP_TO_SINTHelper(SDOperand Op, SelectionDAG &DAG) {
-  assert(Op.getValueType() <= MVT::i64 && Op.getValueType() >= MVT::i16 &&
+  assert(Op.getValueType().getSimpleVT() <= MVT::i64 &&
+         Op.getValueType().getSimpleVT() >= MVT::i16 &&
          "Unknown FP_TO_SINT to lower!");
 
   // These are really Legal.
@@ -4308,11 +4463,11 @@
   // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
   // stack slot.
   MachineFunction &MF = DAG.getMachineFunction();
-  unsigned MemSize = MVT::getSizeInBits(Op.getValueType())/8;
+  unsigned MemSize = Op.getValueType().getSizeInBits()/8;
   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
   SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   unsigned Opc;
-  switch (Op.getValueType()) {
+  switch (Op.getValueType().getSimpleVT()) {
   default: assert(0 && "Invalid FP_TO_SINT to lower!");
   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
@@ -4355,19 +4510,23 @@
   std::pair<SDOperand,SDOperand> Vals = FP_TO_SINTHelper(SDOperand(N, 0), DAG);
   SDOperand FIST = Vals.first, StackSlot = Vals.second;
   if (FIST.Val == 0) return 0;
-  
-  // Return an i64 load from the stack slot.
-  SDOperand Res = DAG.getLoad(MVT::i64, FIST, StackSlot, NULL, 0);
 
-  // Use a MERGE_VALUES node to drop the chain result value.
-  return DAG.getNode(ISD::MERGE_VALUES, MVT::i64, Res).Val;
-}  
+  MVT VT = N->getValueType(0);
+
+  // Return a load from the stack slot.
+  SDOperand Res = DAG.getLoad(VT, FIST, StackSlot, NULL, 0);
+
+  // Use MERGE_VALUES to drop the chain result value and get a node with one
+  // result.  This requires turning off getMergeValues simplification, since
+  // otherwise it will give us Res back.
+  return DAG.getMergeValues(&Res, 1, false).Val;
+}
 
 SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) {
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType EltVT = VT;
-  if (MVT::isVector(VT))
-    EltVT = MVT::getVectorElementType(VT);
+  MVT VT = Op.getValueType();
+  MVT EltVT = VT;
+  if (VT.isVector())
+    EltVT = VT.getVectorElementType();
   std::vector<Constant*> CV;
   if (EltVT == MVT::f64) {
     Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))));
@@ -4389,12 +4548,12 @@
 }
 
 SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) {
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType EltVT = VT;
+  MVT VT = Op.getValueType();
+  MVT EltVT = VT;
   unsigned EltNum = 1;
-  if (MVT::isVector(VT)) {
-    EltVT = MVT::getVectorElementType(VT);
-    EltNum = MVT::getVectorNumElements(VT);
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    EltNum = VT.getVectorNumElements();
   }
   std::vector<Constant*> CV;
   if (EltVT == MVT::f64) {
@@ -4413,7 +4572,7 @@
   SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx,
                                PseudoSourceValue::getConstantPool(), 0,
                                false, 16);
-  if (MVT::isVector(VT)) {
+  if (VT.isVector()) {
     return DAG.getNode(ISD::BIT_CONVERT, VT,
                        DAG.getNode(ISD::XOR, MVT::v2i64,
                     DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Op.getOperand(0)),
@@ -4426,16 +4585,16 @@
 SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
   SDOperand Op0 = Op.getOperand(0);
   SDOperand Op1 = Op.getOperand(1);
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType SrcVT = Op1.getValueType();
+  MVT VT = Op.getValueType();
+  MVT SrcVT = Op1.getValueType();
 
   // If second operand is smaller, extend it first.
-  if (MVT::getSizeInBits(SrcVT) < MVT::getSizeInBits(VT)) {
+  if (SrcVT.bitsLT(VT)) {
     Op1 = DAG.getNode(ISD::FP_EXTEND, VT, Op1);
     SrcVT = VT;
   }
   // And if it is bigger, shrink it first.
-  if (MVT::getSizeInBits(SrcVT) > MVT::getSizeInBits(VT)) {
+  if (SrcVT.bitsGT(VT)) {
     Op1 = DAG.getNode(ISD::FP_ROUND, VT, Op1, DAG.getIntPtrConstant(1));
     SrcVT = VT;
   }
@@ -4462,7 +4621,7 @@
   SDOperand SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1);
 
   // Shift sign bit right or left if the two operands have different types.
-  if (MVT::getSizeInBits(SrcVT) > MVT::getSizeInBits(VT)) {
+  if (SrcVT.bitsGT(VT)) {
     // Op0 is MVT::f32, Op1 is MVT::f64.
     SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2f64, SignBit);
     SignBit = DAG.getNode(X86ISD::FSRL, MVT::v2f64, SignBit,
@@ -4501,7 +4660,7 @@
   SDOperand Op1 = Op.getOperand(1);
   SDOperand CC = Op.getOperand(2);
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
-  bool isFP = MVT::isFloatingPoint(Op.getOperand(1).getValueType());
+  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
   unsigned X86CC;
 
   if (translateX86CC(cast<CondCodeSDNode>(CC)->get(), isFP, X86CC,
@@ -4549,10 +4708,10 @@
 
     SDOperand Cmp = Cond.getOperand(1);
     unsigned Opc = Cmp.getOpcode();
-    MVT::ValueType VT = Op.getValueType();
+    MVT VT = Op.getValueType();
     
     bool IllegalFPCMov = false;
-    if (MVT::isFloatingPoint(VT) && !MVT::isVector(VT) &&
+    if (VT.isFloatingPoint() && !VT.isVector() &&
         !isScalarFPTypeInSSEReg(VT))  // FPStack?
       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSignExtended());
     
@@ -4569,7 +4728,7 @@
     Cond= DAG.getNode(X86ISD::CMP, MVT::i32, Cond, DAG.getConstant(0, MVT::i8));
   }
 
-  const MVT::ValueType *VTs = DAG.getNodeValueTypes(Op.getValueType(),
+  const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(),
                                                     MVT::Flag);
   SmallVector<SDOperand, 4> Ops;
   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
@@ -4625,16 +4784,18 @@
                                            SelectionDAG &DAG) {
   assert(Subtarget->isTargetCygMing() &&
          "This should be used only on Cygwin/Mingw targets");
-  
+
   // Get the inputs.
   SDOperand Chain = Op.getOperand(0);
   SDOperand Size  = Op.getOperand(1);
   // FIXME: Ensure alignment here
 
   SDOperand Flag;
-  
-  MVT::ValueType IntPtr = getPointerTy();
-  MVT::ValueType SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+  MVT IntPtr = getPointerTy();
+  MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0));
 
   Chain = DAG.getCopyToReg(Chain, X86::EAX, Size, Flag);
   Flag = Chain.getValue(1);
@@ -4643,17 +4804,20 @@
   SDOperand Ops[] = { Chain,
                       DAG.getTargetExternalSymbol("_alloca", IntPtr),
                       DAG.getRegister(X86::EAX, IntPtr),
+                      DAG.getRegister(X86StackPtr, SPTy),
                       Flag };
-  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops, 4);
+  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops, 5);
   Flag = Chain.getValue(1);
 
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getIntPtrConstant(0),
+                             DAG.getIntPtrConstant(0),
+                             Flag);
+
   Chain = DAG.getCopyFromReg(Chain, X86StackPtr, SPTy).getValue(1);
-  
-  std::vector<MVT::ValueType> Tys;
-  Tys.push_back(SPTy);
-  Tys.push_back(MVT::Other);
+
   SDOperand Ops1[2] = { Chain.getValue(0), Chain };
-  return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops1, 2);
+  return DAG.getMergeValues(Ops1, 2);
 }
 
 SDOperand
@@ -4661,7 +4825,7 @@
                                            SDOperand Chain,
                                            SDOperand Dst, SDOperand Src,
                                            SDOperand Size, unsigned Align,
-                                           const Value *DstSV, uint64_t DstOff) {
+                                        const Value *DstSV, uint64_t DstSVOff) {
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
 
   /// If not DWORD aligned or size is more than the threshold, call the library.
@@ -4676,7 +4840,7 @@
     ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
     if (const char *bzeroEntry = 
           V && V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
-      MVT::ValueType IntPtr = getPointerTy();
+      MVT IntPtr = getPointerTy();
       const Type *IntPtrTy = getTargetData()->getIntPtrType();
       TargetLowering::ArgListTy Args; 
       TargetLowering::ArgListEntry Entry;
@@ -4698,7 +4862,7 @@
 
   uint64_t SizeVal = ConstantSize->getValue();
   SDOperand InFlag(0, 0);
-  MVT::ValueType AVT;
+  MVT AVT;
   SDOperand Count;
   ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
   unsigned BytesLeft = 0;
@@ -4732,8 +4896,8 @@
         break;
     }
 
-    if (AVT > MVT::i8) {
-      unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+    if (AVT.bitsGT(MVT::i8)) {
+      unsigned UBytes = AVT.getSizeInBits() / 8;
       Count = DAG.getIntPtrConstant(SizeVal / UBytes);
       BytesLeft = SizeVal % UBytes;
     }
@@ -4765,7 +4929,7 @@
   if (TwoRepStos) {
     InFlag = Chain.getValue(1);
     Count  = Size;
-    MVT::ValueType CVT = Count.getValueType();
+    MVT CVT = Count.getValueType();
     SDOperand Left = DAG.getNode(ISD::AND, CVT, Count,
                                DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
     Chain  = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
@@ -4780,15 +4944,15 @@
   } else if (BytesLeft) {
     // Handle the last 1 - 7 bytes.
     unsigned Offset = SizeVal - BytesLeft;
-    MVT::ValueType AddrVT = Dst.getValueType();
-    MVT::ValueType SizeVT = Size.getValueType();
+    MVT AddrVT = Dst.getValueType();
+    MVT SizeVT = Size.getValueType();
 
     Chain = DAG.getMemset(Chain,
                           DAG.getNode(ISD::ADD, AddrVT, Dst,
                                       DAG.getConstant(Offset, AddrVT)),
                           Src,
                           DAG.getConstant(BytesLeft, SizeVT),
-                          Align, DstSV, Offset);
+                          Align, DstSV, DstSVOff + Offset);
   }
 
   // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
@@ -4801,8 +4965,8 @@
                                            SDOperand Dst, SDOperand Src,
                                            SDOperand Size, unsigned Align,
                                            bool AlwaysInline,
-                                           const Value *DstSV, uint64_t DstOff,
-                                           const Value *SrcSV, uint64_t SrcOff){
+                                           const Value *DstSV, uint64_t DstSVOff,
+                                           const Value *SrcSV, uint64_t SrcSVOff){
   
   // This requires the copy size to be a constant, preferrably
   // within a subtarget-specific limit.
@@ -4813,9 +4977,7 @@
   if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
     return SDOperand();
 
-  SmallVector<SDOperand, 4> Results;
-
-  MVT::ValueType AVT;
+  MVT AVT;
   unsigned BytesLeft = 0;
   if (Align >= 8 && Subtarget->is64Bit())
     AVT = MVT::i64;
@@ -4826,7 +4988,7 @@
   else
     AVT = MVT::i8;
 
-  unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+  unsigned UBytes = AVT.getSizeInBits() / 8;
   unsigned CountVal = SizeVal / UBytes;
   SDOperand Count = DAG.getIntPtrConstant(CountVal);
   BytesLeft = SizeVal % UBytes;
@@ -4847,25 +5009,25 @@
   Ops.push_back(Chain);
   Ops.push_back(DAG.getValueType(AVT));
   Ops.push_back(InFlag);
-  Results.push_back(DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size()));
+  SDOperand RepMovs = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
 
+  SmallVector<SDOperand, 4> Results;
+  Results.push_back(RepMovs);
   if (BytesLeft) {
     // Handle the last 1 - 7 bytes.
     unsigned Offset = SizeVal - BytesLeft;
-    MVT::ValueType DstVT = Dst.getValueType();
-    MVT::ValueType SrcVT = Src.getValueType();
-    MVT::ValueType SizeVT = Size.getValueType();
-
-    Results.push_back(DAG.getMemcpy(Chain, 
+    MVT DstVT = Dst.getValueType();
+    MVT SrcVT = Src.getValueType();
+    MVT SizeVT = Size.getValueType();
+    Results.push_back(DAG.getMemcpy(Chain,
                                     DAG.getNode(ISD::ADD, DstVT, Dst,
-                                                DAG.getConstant(Offset,
-                                                                DstVT)),
+                                                DAG.getConstant(Offset, DstVT)),
                                     DAG.getNode(ISD::ADD, SrcVT, Src,
-                                                DAG.getConstant(Offset,
-                                                                SrcVT)),
+                                                DAG.getConstant(Offset, SrcVT)),
                                     DAG.getConstant(BytesLeft, SizeVT),
                                     Align, AlwaysInline,
-                                    DstSV, Offset, SrcSV, Offset));
+                                    DstSV, DstSVOff + Offset,
+                                    SrcSV, SrcSVOff + Offset));
   }
 
   return DAG.getNode(ISD::TokenFactor, MVT::Other, &Results[0], Results.size());
@@ -4886,8 +5048,7 @@
       DAG.getNode(ISD::OR, MVT::i64, rax, Tmp), rdx.getValue(1)
     };
     
-    Tys = DAG.getVTList(MVT::i64, MVT::Other);
-    return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2).Val;
+    return DAG.getMergeValues(Ops, 2).Val;
   }
   
   SDOperand eax = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1));
@@ -4899,8 +5060,7 @@
 
   // Use a MERGE_VALUES to return the value and chain.
   Ops[1] = edx.getValue(1);
-  Tys = DAG.getVTList(MVT::i64, MVT::Other);
-  return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2).Val;
+  return DAG.getMergeValues(Ops, 2).Val;
 }
 
 SDOperand X86TargetLowering::LowerVASTART(SDOperand Op, SelectionDAG &DAG) {
@@ -4947,6 +5107,18 @@
   return DAG.getNode(ISD::TokenFactor, MVT::Other, &MemOps[0], MemOps.size());
 }
 
+SDOperand X86TargetLowering::LowerVAARG(SDOperand Op, SelectionDAG &DAG) {
+  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand SrcPtr = Op.getOperand(1);
+  SDOperand SrcSV = Op.getOperand(2);
+
+  assert(0 && "VAArgInst is not yet implemented for x86-64!");
+  abort();
+  return SDOperand();
+}
+
 SDOperand X86TargetLowering::LowerVACOPY(SDOperand Op, SelectionDAG &DAG) {
   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
@@ -4966,7 +5138,7 @@
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getValue();
   switch (IntNo) {
   default: return SDOperand();    // Don't custom lower most intrinsics.
-    // Comparison intrinsics.
+  // Comparison intrinsics.
   case Intrinsic::x86_sse_comieq_ss:
   case Intrinsic::x86_sse_comilt_ss:
   case Intrinsic::x86_sse_comile_ss:
@@ -5067,6 +5239,95 @@
                                   DAG.getConstant(X86CC, MVT::i8), Cond);
     return DAG.getNode(ISD::ANY_EXTEND, MVT::i32, SetCC);
   }
+
+  // Fix vector shift instructions where the last operand is a non-immediate
+  // i32 value.
+  case Intrinsic::x86_sse2_pslli_w:
+  case Intrinsic::x86_sse2_pslli_d:
+  case Intrinsic::x86_sse2_pslli_q:
+  case Intrinsic::x86_sse2_psrli_w:
+  case Intrinsic::x86_sse2_psrli_d:
+  case Intrinsic::x86_sse2_psrli_q:
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_mmx_pslli_w:
+  case Intrinsic::x86_mmx_pslli_d:
+  case Intrinsic::x86_mmx_pslli_q:
+  case Intrinsic::x86_mmx_psrli_w:
+  case Intrinsic::x86_mmx_psrli_d:
+  case Intrinsic::x86_mmx_psrli_q:
+  case Intrinsic::x86_mmx_psrai_w:
+  case Intrinsic::x86_mmx_psrai_d: {
+    SDOperand ShAmt = Op.getOperand(2);
+    if (isa<ConstantSDNode>(ShAmt))
+      return SDOperand();
+
+    unsigned NewIntNo = 0;
+    MVT ShAmtVT = MVT::v4i32;
+    switch (IntNo) {
+    case Intrinsic::x86_sse2_pslli_w:
+      NewIntNo = Intrinsic::x86_sse2_psll_w;
+      break;
+    case Intrinsic::x86_sse2_pslli_d:
+      NewIntNo = Intrinsic::x86_sse2_psll_d;
+      break;
+    case Intrinsic::x86_sse2_pslli_q:
+      NewIntNo = Intrinsic::x86_sse2_psll_q;
+      break;
+    case Intrinsic::x86_sse2_psrli_w:
+      NewIntNo = Intrinsic::x86_sse2_psrl_w;
+      break;
+    case Intrinsic::x86_sse2_psrli_d:
+      NewIntNo = Intrinsic::x86_sse2_psrl_d;
+      break;
+    case Intrinsic::x86_sse2_psrli_q:
+      NewIntNo = Intrinsic::x86_sse2_psrl_q;
+      break;
+    case Intrinsic::x86_sse2_psrai_w:
+      NewIntNo = Intrinsic::x86_sse2_psra_w;
+      break;
+    case Intrinsic::x86_sse2_psrai_d:
+      NewIntNo = Intrinsic::x86_sse2_psra_d;
+      break;
+    default: {
+      ShAmtVT = MVT::v2i32;
+      switch (IntNo) {
+      case Intrinsic::x86_mmx_pslli_w:
+        NewIntNo = Intrinsic::x86_mmx_psll_w;
+        break;
+      case Intrinsic::x86_mmx_pslli_d:
+        NewIntNo = Intrinsic::x86_mmx_psll_d;
+        break;
+      case Intrinsic::x86_mmx_pslli_q:
+        NewIntNo = Intrinsic::x86_mmx_psll_q;
+        break;
+      case Intrinsic::x86_mmx_psrli_w:
+        NewIntNo = Intrinsic::x86_mmx_psrl_w;
+        break;
+      case Intrinsic::x86_mmx_psrli_d:
+        NewIntNo = Intrinsic::x86_mmx_psrl_d;
+        break;
+      case Intrinsic::x86_mmx_psrli_q:
+        NewIntNo = Intrinsic::x86_mmx_psrl_q;
+        break;
+      case Intrinsic::x86_mmx_psrai_w:
+        NewIntNo = Intrinsic::x86_mmx_psra_w;
+        break;
+      case Intrinsic::x86_mmx_psrai_d:
+        NewIntNo = Intrinsic::x86_mmx_psra_d;
+        break;
+      default: abort();  // Can't reach here.
+      }
+      break;
+    }
+    }
+    MVT VT = Op.getValueType();
+    ShAmt = DAG.getNode(ISD::BIT_CONVERT, VT,
+                        DAG.getNode(ISD::SCALAR_TO_VECTOR, ShAmtVT, ShAmt));
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                       DAG.getConstant(NewIntNo, MVT::i32),
+                       Op.getOperand(1), ShAmt);
+  }
   }
 }
 
@@ -5143,10 +5404,8 @@
     const unsigned char JMP64r  = TII->getBaseOpcodeFor(X86::JMP64r);
     const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);
 
-    const unsigned char N86R10 =
-      ((const X86RegisterInfo*)RegInfo)->getX86RegNum(X86::R10);
-    const unsigned char N86R11 =
-      ((const X86RegisterInfo*)RegInfo)->getX86RegNum(X86::R11);
+    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
+    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
 
     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
 
@@ -5182,7 +5441,7 @@
 
     SDOperand Ops[] =
       { Trmp, DAG.getNode(ISD::TokenFactor, MVT::Other, OutChains, 6) };
-    return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(), Ops, 2);
+    return DAG.getMergeValues(Ops, 2);
   } else {
     const Function *Func =
       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
@@ -5233,8 +5492,7 @@
     Disp = DAG.getNode(ISD::SUB, MVT::i32, FPtr, Addr);
 
     const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
-    const unsigned char N86Reg =
-      ((const X86RegisterInfo*)RegInfo)->getX86RegNum(NestReg);
+    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
     OutChains[0] = DAG.getStore(Root, DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                 Trmp, TrmpAddr, 0);
 
@@ -5251,7 +5509,7 @@
 
     SDOperand Ops[] =
       { Trmp, DAG.getNode(ISD::TokenFactor, MVT::Other, OutChains, 4) };
-    return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(), Ops, 2);
+    return DAG.getMergeValues(Ops, 2);
   }
 }
 
@@ -5279,7 +5537,7 @@
   const TargetMachine &TM = MF.getTarget();
   const TargetFrameInfo &TFI = *TM.getFrameInfo();
   unsigned StackAlignment = TFI.getStackAlignment();
-  MVT::ValueType VT = Op.getValueType();
+  MVT VT = Op.getValueType();
 
   // Save FP Control Word to stack slot
   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment);
@@ -5311,14 +5569,14 @@
                 DAG.getConstant(3, MVT::i16));
 
 
-  return DAG.getNode((MVT::getSizeInBits(VT) < 16 ?
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
                       ISD::TRUNCATE : ISD::ZERO_EXTEND), VT, RetVal);
 }
 
 SDOperand X86TargetLowering::LowerCTLZ(SDOperand Op, SelectionDAG &DAG) {
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType OpVT = VT;
-  unsigned NumBits = MVT::getSizeInBits(VT);
+  MVT VT = Op.getValueType();
+  MVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
 
   Op = Op.getOperand(0);
   if (VT == MVT::i8) {
@@ -5348,9 +5606,9 @@
 }
 
 SDOperand X86TargetLowering::LowerCTTZ(SDOperand Op, SelectionDAG &DAG) {
-  MVT::ValueType VT = Op.getValueType();
-  MVT::ValueType OpVT = VT;
-  unsigned NumBits = MVT::getSizeInBits(VT);
+  MVT VT = Op.getValueType();
+  MVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
 
   Op = Op.getOperand(0);
   if (VT == MVT::i8) {
@@ -5375,11 +5633,13 @@
   return Op;
 }
 
-SDOperand X86TargetLowering::LowerLCS(SDOperand Op, SelectionDAG &DAG) {
-  MVT::ValueType T = cast<AtomicSDNode>(Op.Val)->getVT();
+SDOperand X86TargetLowering::LowerCMP_SWAP(SDOperand Op, SelectionDAG &DAG) {
+  MVT T = Op.getValueType();
   unsigned Reg = 0;
   unsigned size = 0;
-  switch(T) {
+  switch(T.getSimpleVT()) {
+  default:
+    assert(false && "Invalid value type!");
   case MVT::i8:  Reg = X86::AL;  size = 1; break;
   case MVT::i16: Reg = X86::AX;  size = 2; break;
   case MVT::i32: Reg = X86::EAX; size = 4; break;
@@ -5387,7 +5647,7 @@
     if (Subtarget->is64Bit()) {
       Reg = X86::RAX; size = 8;
     } else //Should go away when LowerType stuff lands
-      return SDOperand(ExpandATOMIC_LCS(Op.Val, DAG), 0);
+      return SDOperand(ExpandATOMIC_CMP_SWAP(Op.Val, DAG), 0);
     break;
   };
   SDOperand cpIn = DAG.getCopyToReg(Op.getOperand(0), Reg,
@@ -5404,9 +5664,9 @@
   return cpOut;
 }
 
-SDNode* X86TargetLowering::ExpandATOMIC_LCS(SDNode* Op, SelectionDAG &DAG) {
-  MVT::ValueType T = cast<AtomicSDNode>(Op)->getVT();
-  assert (T == MVT::i64 && "Only know how to expand i64 CAS");
+SDNode* X86TargetLowering::ExpandATOMIC_CMP_SWAP(SDNode* Op, SelectionDAG &DAG) {
+  MVT T = Op->getValueType(0);
+  assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
   SDOperand cpInL, cpInH;
   cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(3),
                       DAG.getConstant(0, MVT::i32));
@@ -5436,8 +5696,19 @@
                                         cpOutL.getValue(2));
   SDOperand OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
   SDOperand ResultVal = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, OpsF, 2);
-  Tys = DAG.getVTList(MVT::i64, MVT::Other);
-  return DAG.getNode(ISD::MERGE_VALUES, Tys, ResultVal, cpOutH.getValue(1)).Val;
+  SDOperand Vals[2] = { ResultVal, cpOutH.getValue(1) };
+  return DAG.getMergeValues(Vals, 2).Val;
+}
+
+SDNode* X86TargetLowering::ExpandATOMIC_LOAD_SUB(SDNode* Op, SelectionDAG &DAG) {
+  MVT T = Op->getValueType(0);
+  assert (T == MVT::i32 && "Only know how to expand i32 Atomic Load Sub");
+  SDOperand negOp = DAG.getNode(ISD::SUB, T,
+                                DAG.getConstant(0, T), Op->getOperand(2));
+  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, Op->getOperand(0),
+                       Op->getOperand(1), negOp,
+                       cast<AtomicSDNode>(Op)->getSrcValue(),
+                       cast<AtomicSDNode>(Op)->getAlignment()).Val;
 }
 
 /// LowerOperation - Provide custom lowering hooks for some operations.
@@ -5445,7 +5716,7 @@
 SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
   switch (Op.getOpcode()) {
   default: assert(0 && "Should not custom lower this!");
-  case ISD::ATOMIC_LCS:         return LowerLCS(Op,DAG);
+  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
@@ -5471,6 +5742,7 @@
   case ISD::RET:                return LowerRET(Op, DAG);
   case ISD::FORMAL_ARGUMENTS:   return LowerFORMAL_ARGUMENTS(Op, DAG);
   case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
   case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
@@ -5490,13 +5762,15 @@
   }
 }
 
-/// ExpandOperation - Provide custom lowering hooks for expanding operations.
-SDNode *X86TargetLowering::ExpandOperationResult(SDNode *N, SelectionDAG &DAG) {
+/// ReplaceNodeResults - Replace a node with an illegal result type
+/// with a new node built out of custom code.
+SDNode *X86TargetLowering::ReplaceNodeResults(SDNode *N, SelectionDAG &DAG) {
   switch (N->getOpcode()) {
   default: assert(0 && "Should not custom lower this!");
   case ISD::FP_TO_SINT:         return ExpandFP_TO_SINT(N, DAG);
   case ISD::READCYCLECOUNTER:   return ExpandREADCYCLECOUNTER(N, DAG);
-  case ISD::ATOMIC_LCS:         return ExpandATOMIC_LCS(N, DAG);
+  case ISD::ATOMIC_CMP_SWAP:    return ExpandATOMIC_CMP_SWAP(N, DAG);
+  case ISD::ATOMIC_LOAD_SUB:    return ExpandATOMIC_LOAD_SUB(N,DAG);
   }
 }
 
@@ -5546,8 +5820,12 @@
   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
-  case X86ISD::LCMPXCHG_DAG:       return "x86ISD::LCMPXCHG_DAG";
-  case X86ISD::LCMPXCHG8_DAG:      return "x86ISD::LCMPXCHG8_DAG";
+  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
+  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
+  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
+  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VSHL:               return "X86ISD::VSHL";
+  case X86ISD::VSRL:               return "X86ISD::VSRL";
   }
 }
 
@@ -5610,12 +5888,11 @@
   return Subtarget->is64Bit() || NumBits1 < 64;
 }
 
-bool X86TargetLowering::isTruncateFree(MVT::ValueType VT1,
-                                       MVT::ValueType VT2) const {
-  if (!MVT::isInteger(VT1) || !MVT::isInteger(VT2))
+bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
     return false;
-  unsigned NumBits1 = MVT::getSizeInBits(VT1);
-  unsigned NumBits2 = MVT::getSizeInBits(VT2);
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
   if (NumBits1 <= NumBits2)
     return false;
   return Subtarget->is64Bit() || NumBits1 < 64;
@@ -5626,9 +5903,9 @@
 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
 /// are assumed to be legal.
 bool
-X86TargetLowering::isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const {
+X86TargetLowering::isShuffleMaskLegal(SDOperand Mask, MVT VT) const {
   // Only do shuffles on 128-bit vector types for now.
-  if (MVT::getSizeInBits(VT) == 64) return false;
+  if (VT.getSizeInBits() == 64) return false;
   return (Mask.Val->getNumOperands() <= 4 ||
           isIdentityMask(Mask.Val) ||
           isIdentityMask(Mask.Val, true) ||
@@ -5642,11 +5919,10 @@
 
 bool
 X86TargetLowering::isVectorClearMaskLegal(const std::vector<SDOperand> &BVOps,
-                                          MVT::ValueType EVT,
-                                          SelectionDAG &DAG) const {
+                                          MVT EVT, SelectionDAG &DAG) const {
   unsigned NumElts = BVOps.size();
   // Only do shuffles on 128-bit vector types for now.
-  if (MVT::getSizeInBits(EVT) * NumElts == 64) return false;
+  if (EVT.getSizeInBits() * NumElts == 64) return false;
   if (NumElts == 2) return true;
   if (NumElts == 4) {
     return (isMOVLMask(&BVOps[0], 4)  ||
@@ -5661,6 +5937,195 @@
 //                           X86 Scheduler Hooks
 //===----------------------------------------------------------------------===//
 
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
+                                                       MachineBasicBlock *MBB,
+                                                       unsigned regOpc,
+                                                       unsigned immOpc,
+                                                       bool invSrc) {
+  // For the atomic bitwise operator, we generate
+  //   thisMBB:
+  //   newMBB:
+  //     ld  t1 = [bitinstr.addr]
+  //     op  t2 = t1, [bitinstr.val]
+  //     mov EAX = t1
+  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
+  //     bz  newMBB
+  //     fallthrough -->nextMBB
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  ilist<MachineBasicBlock>::iterator MBBIter = MBB;
+  ++MBBIter;
+  
+  /// First build the CFG
+  MachineFunction *F = MBB->getParent();
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *newMBB = new MachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *nextMBB = new MachineBasicBlock(LLVM_BB);
+  F->getBasicBlockList().insert(MBBIter, newMBB);
+  F->getBasicBlockList().insert(MBBIter, nextMBB);
+  
+  // Move all successors to thisMBB to nextMBB
+  nextMBB->transferSuccessors(thisMBB);
+    
+  // Update thisMBB to fall through to newMBB
+  thisMBB->addSuccessor(newMBB);
+  
+  // newMBB jumps to itself and fall through to nextMBB
+  newMBB->addSuccessor(nextMBB);
+  newMBB->addSuccessor(newMBB);
+  
+  // Insert instructions into newMBB based on incoming instruction
+  assert(bInstr->getNumOperands() < 8 && "unexpected number of operands");
+  MachineOperand& destOper = bInstr->getOperand(0);
+  MachineOperand* argOpers[6];
+  int numArgs = bInstr->getNumOperands() - 1;
+  for (int i=0; i < numArgs; ++i)
+    argOpers[i] = &bInstr->getOperand(i+1);
+
+  // x86 address has 4 operands: base, index, scale, and displacement
+  int lastAddrIndx = 3; // [0,3]
+  int valArgIndx = 4;
+  
+  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(X86::MOV32rm), t1);
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+
+  unsigned tt = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  if (invSrc) {
+    MIB = BuildMI(newMBB, TII->get(X86::NOT32r), tt).addReg(t1);
+  }
+  else 
+    tt = t1;
+
+  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  assert(   (argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm())
+         && "invalid operand");
+  if (argOpers[valArgIndx]->isReg())
+    MIB = BuildMI(newMBB, TII->get(regOpc), t2);
+  else
+    MIB = BuildMI(newMBB, TII->get(immOpc), t2);
+  MIB.addReg(tt);
+  (*MIB).addOperand(*argOpers[valArgIndx]);
+
+  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), X86::EAX);
+  MIB.addReg(t1);
+  
+  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG32));
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  MIB.addReg(t2);
+  
+  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), destOper.getReg());
+  MIB.addReg(X86::EAX);
+  
+  // insert branch
+  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);
+
+  delete bInstr;   // The pseudo instruction is gone now.
+  return nextMBB;
+}
+
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
+                                                      MachineBasicBlock *MBB,
+                                                      unsigned cmovOpc) {
+  // For the atomic min/max operator, we generate
+  //   thisMBB:
+  //   newMBB:
+  //     ld t1 = [min/max.addr]
+  //     mov t2 = [min/max.val] 
+  //     cmp  t1, t2
+  //     cmov[cond] t2 = t1
+  //     mov EAX = t1
+  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
+  //     bz   newMBB
+  //     fallthrough -->nextMBB
+  //
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  ilist<MachineBasicBlock>::iterator MBBIter = MBB;
+  ++MBBIter;
+  
+  /// First build the CFG
+  MachineFunction *F = MBB->getParent();
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *newMBB = new MachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *nextMBB = new MachineBasicBlock(LLVM_BB);
+  F->getBasicBlockList().insert(MBBIter, newMBB);
+  F->getBasicBlockList().insert(MBBIter, nextMBB);
+  
+  // Move all successors to thisMBB to nextMBB
+  nextMBB->transferSuccessors(thisMBB);
+  
+  // Update thisMBB to fall through to newMBB
+  thisMBB->addSuccessor(newMBB);
+  
+  // newMBB jumps to newMBB and fall through to nextMBB
+  newMBB->addSuccessor(nextMBB);
+  newMBB->addSuccessor(newMBB);
+  
+  // Insert instructions into newMBB based on incoming instruction
+  assert(mInstr->getNumOperands() < 8 && "unexpected number of operands");
+  MachineOperand& destOper = mInstr->getOperand(0);
+  MachineOperand* argOpers[6];
+  int numArgs = mInstr->getNumOperands() - 1;
+  for (int i=0; i < numArgs; ++i)
+    argOpers[i] = &mInstr->getOperand(i+1);
+  
+  // x86 address has 4 operands: base, index, scale, and displacement
+  int lastAddrIndx = 3; // [0,3]
+  int valArgIndx = 4;
+  
+  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(X86::MOV32rm), t1);
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+
+  // We only support register and immediate values
+  assert(   (argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm())
+         && "invalid operand");
+  
+  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);  
+  if (argOpers[valArgIndx]->isReg())
+    MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t2);
+  else 
+    MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t2);
+  (*MIB).addOperand(*argOpers[valArgIndx]);
+
+  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), X86::EAX);
+  MIB.addReg(t1);
+
+  MIB = BuildMI(newMBB, TII->get(X86::CMP32rr));
+  MIB.addReg(t1);
+  MIB.addReg(t2);
+
+  // Generate movc
+  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  MIB = BuildMI(newMBB, TII->get(cmovOpc),t3);
+  MIB.addReg(t2);
+  MIB.addReg(t1);
+
+  // Cmp and exchange if none has modified the memory location
+  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG32));
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  MIB.addReg(t3);
+  
+  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), destOper.getReg());
+  MIB.addReg(X86::EAX);
+  
+  // insert branch
+  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);
+
+  delete mInstr;   // The pseudo instruction is gone now.
+  return nextMBB;
+}
+
+
 MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) {
@@ -5695,15 +6160,11 @@
     MachineFunction *F = BB->getParent();
     F->getBasicBlockList().insert(It, copy0MBB);
     F->getBasicBlockList().insert(It, sinkMBB);
-    // Update machine-CFG edges by first adding all successors of the current
+    // Update machine-CFG edges by transferring all successors of the current
     // block to the new block which will contain the Phi node for the select.
-    for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
-        e = BB->succ_end(); i != e; ++i)
-      sinkMBB->addSuccessor(*i);
-    // Next, remove all successors of the current block, and add the true
-    // and fallthrough blocks as its successors.
-    while(!BB->succ_empty())
-      BB->removeSuccessor(BB->succ_begin());
+    sinkMBB->transferSuccessors(BB);
+
+    // Add the true and fallthrough blocks as its successors.
     BB->addSuccessor(copy0MBB);
     BB->addSuccessor(sinkMBB);
 
@@ -5803,6 +6264,26 @@
     delete MI;   // The pseudo instruction is gone now.
     return BB;
   }
+  case X86::ATOMAND32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+                                                       X86::AND32ri);
+  case X86::ATOMOR32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 
+                                                       X86::OR32ri);
+  case X86::ATOMXOR32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
+                                                       X86::XOR32ri);
+  case X86::ATOMNAND32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+                                               X86::AND32ri, true);
+  case X86::ATOMMIN32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
+  case X86::ATOMMAX32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
+  case X86::ATOMUMIN32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
+  case X86::ATOMUMAX32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
   }
 }
 
@@ -5834,140 +6315,130 @@
   }
 }
 
-/// getShuffleScalarElt - Returns the scalar element that will make up the ith
-/// element of the result of the vector shuffle.
-static SDOperand getShuffleScalarElt(SDNode *N, unsigned i, SelectionDAG &DAG) {
-  MVT::ValueType VT = N->getValueType(0);
-  SDOperand PermMask = N->getOperand(2);
-  unsigned NumElems = PermMask.getNumOperands();
-  SDOperand V = (i < NumElems) ? N->getOperand(0) : N->getOperand(1);
-  i %= NumElems;
-  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-    return (i == 0)
-     ? V.getOperand(0) : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
-  } else if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
-    SDOperand Idx = PermMask.getOperand(i);
-    if (Idx.getOpcode() == ISD::UNDEF)
-      return DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
-    return getShuffleScalarElt(V.Val,cast<ConstantSDNode>(Idx)->getValue(),DAG);
-  }
-  return SDOperand();
-}
-
 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
-/// node is a GlobalAddress + an offset.
-static bool isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) {
-  unsigned Opc = N->getOpcode();
-  if (Opc == X86ISD::Wrapper) {
-    if (dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) {
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+                                       GlobalValue* &GA, int64_t &Offset) const{
+  if (N->getOpcode() == X86ISD::Wrapper) {
+    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
       return true;
     }
-  } else if (Opc == ISD::ADD) {
-    SDOperand N1 = N->getOperand(0);
-    SDOperand N2 = N->getOperand(1);
-    if (isGAPlusOffset(N1.Val, GA, Offset)) {
-      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2);
-      if (V) {
-        Offset += V->getSignExtended();
-        return true;
-      }
-    } else if (isGAPlusOffset(N2.Val, GA, Offset)) {
-      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1);
-      if (V) {
-        Offset += V->getSignExtended();
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-/// isConsecutiveLoad - Returns true if N is loading from an address of Base
-/// + Dist * Size.
-static bool isConsecutiveLoad(SDNode *N, SDNode *Base, int Dist, int Size,
-                              MachineFrameInfo *MFI) {
-  if (N->getOperand(0).Val != Base->getOperand(0).Val)
-    return false;
-
-  SDOperand Loc = N->getOperand(1);
-  SDOperand BaseLoc = Base->getOperand(1);
-  if (Loc.getOpcode() == ISD::FrameIndex) {
-    if (BaseLoc.getOpcode() != ISD::FrameIndex)
-      return false;
-    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
-    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
-    int FS  = MFI->getObjectSize(FI);
-    int BFS = MFI->getObjectSize(BFI);
-    if (FS != BFS || FS != Size) return false;
-    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Size);
-  } else {
-    GlobalValue *GV1 = NULL;
-    GlobalValue *GV2 = NULL;
-    int64_t Offset1 = 0;
-    int64_t Offset2 = 0;
-    bool isGA1 = isGAPlusOffset(Loc.Val, GV1, Offset1);
-    bool isGA2 = isGAPlusOffset(BaseLoc.Val, GV2, Offset2);
-    if (isGA1 && isGA2 && GV1 == GV2)
-      return Offset1 == (Offset2 + Dist*Size);
   }
-
-  return false;
+  return TargetLowering::isGAPlusOffset(N, GA, Offset);
 }
 
-static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
-                              const X86Subtarget *Subtarget) {
+static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
+                               const TargetLowering &TLI) {
   GlobalValue *GV;
   int64_t Offset = 0;
-  if (isGAPlusOffset(Base, GV, Offset))
-    return (GV->getAlignment() >= 16 && (Offset % 16) == 0);
+  if (TLI.isGAPlusOffset(Base, GV, Offset))
+    return (GV->getAlignment() >= N && (Offset % N) == 0);
   // DAG combine handles the stack object case.
   return false;
 }
 
+static bool EltsFromConsecutiveLoads(SDNode *N, SDOperand PermMask,
+                                     unsigned NumElems, MVT EVT,
+                                     SDNode *&Base,
+                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
+                                     const TargetLowering &TLI) {
+  Base = NULL;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDOperand Idx = PermMask.getOperand(i);
+    if (Idx.getOpcode() == ISD::UNDEF) {
+      if (!Base)
+        return false;
+      continue;
+    }
+
+    SDOperand Elt = DAG.getShuffleScalarElt(N, i);
+    if (!Elt.Val ||
+        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.Val)))
+      return false;
+    if (!Base) {
+      Base = Elt.Val;
+      if (Base->getOpcode() == ISD::UNDEF)
+        return false;
+      continue;
+    }
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+
+    if (!TLI.isConsecutiveLoad(Elt.Val, Base,
+                               EVT.getSizeInBits()/8, i, MFI))
+      return false;
+  }
+  return true;
+}
 
 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
 /// if the load addresses are consecutive, non-overlapping, and in the right
 /// order.
 static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
-                                       const X86Subtarget *Subtarget) {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MVT::ValueType VT = N->getValueType(0);
-  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+                                       const TargetLowering &TLI) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MVT VT = N->getValueType(0);
+  MVT EVT = VT.getVectorElementType();
   SDOperand PermMask = N->getOperand(2);
-  int NumElems = (int)PermMask.getNumOperands();
+  unsigned NumElems = PermMask.getNumOperands();
   SDNode *Base = NULL;
-  for (int i = 0; i < NumElems; ++i) {
-    SDOperand Idx = PermMask.getOperand(i);
-    if (Idx.getOpcode() == ISD::UNDEF) {
-      if (!Base) return SDOperand();
-    } else {
-      SDOperand Arg =
-        getShuffleScalarElt(N, cast<ConstantSDNode>(Idx)->getValue(), DAG);
-      if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val))
-        return SDOperand();
-      if (!Base)
-        Base = Arg.Val;
-      else if (!isConsecutiveLoad(Arg.Val, Base,
-                                  i, MVT::getSizeInBits(EVT)/8,MFI))
-        return SDOperand();
-    }
-  }
+  if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, Base,
+                                DAG, MFI, TLI))
+    return SDOperand();
 
-  bool isAlign16 = isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget);
   LoadSDNode *LD = cast<LoadSDNode>(Base);
-  if (isAlign16) {
+  if (isBaseAlignmentOfN(16, Base->getOperand(1).Val, TLI))
     return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
                        LD->getSrcValueOffset(), LD->isVolatile());
-  } else {
-    return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
-                       LD->getSrcValueOffset(), LD->isVolatile(),
-                       LD->getAlignment());
-  }
+  return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
+                     LD->getSrcValueOffset(), LD->isVolatile(),
+                     LD->getAlignment());
 }
 
+/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
+static SDOperand PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
+                                           const X86Subtarget *Subtarget,
+                                           const TargetLowering &TLI) {
+  unsigned NumOps = N->getNumOperands();
+
+  // Ignore single operand BUILD_VECTOR.
+  if (NumOps == 1)
+    return SDOperand();
+
+  MVT VT = N->getValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
+    // We are looking for load i64 and zero extend. We want to transform
+    // it before legalizer has a chance to expand it. Also look for i64
+    // BUILD_PAIR bit casted to f64.
+    return SDOperand();
+  // This must be an insertion into a zero vector.
+  SDOperand HighElt = N->getOperand(1);
+  if (!isZeroNode(HighElt))
+    return SDOperand();
+
+  // Value must be a load.
+  SDNode *Base = N->getOperand(0).Val;
+  if (!isa<LoadSDNode>(Base)) {
+    if (Base->getOpcode() != ISD::BIT_CONVERT)
+      return SDOperand();
+    Base = Base->getOperand(0).Val;
+    if (!isa<LoadSDNode>(Base))
+      return SDOperand();
+  }
+
+  // Transform it into VZEXT_LOAD addr.
+  LoadSDNode *LD = cast<LoadSDNode>(Base);
+  
+  // Load must not be an extload.
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return SDOperand();
+  
+  return DAG.getNode(X86ISD::VZEXT_LOAD, VT, LD->getChain(), LD->getBasePtr());
+}                                           
+
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
 static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget *Subtarget) {
@@ -6041,14 +6512,15 @@
 }
 
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
-static SDOperand PerformSTORECombine(StoreSDNode *St, SelectionDAG &DAG,
+static SDOperand PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS.  This qualifies as a quick hack.
-  if (MVT::isVector(St->getValue().getValueType()) && 
-      MVT::getSizeInBits(St->getValue().getValueType()) == 64 &&
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  if (St->getValue().getValueType().isVector() &&
+      St->getValue().getValueType().getSizeInBits() == 64 &&
       isa<LoadSDNode>(St->getValue()) &&
       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
       St->getChain().hasOneUse() && !St->isVolatile()) {
@@ -6093,7 +6565,7 @@
       // Otherwise, lower to two 32-bit copies.
       SDOperand LoAddr = Ld->getBasePtr();
       SDOperand HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr,
-                                     DAG.getConstant(MVT::i32, 4));
+                                     DAG.getConstant(4, MVT::i32));
 
       SDOperand LoLd = DAG.getLoad(MVT::i32, Ld->getChain(), LoAddr,
                                    Ld->getSrcValue(), Ld->getSrcValueOffset(),
@@ -6113,7 +6585,7 @@
 
       LoAddr = St->getBasePtr();
       HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr,
-                           DAG.getConstant(MVT::i32, 4));
+                           DAG.getConstant(4, MVT::i32));
 
       SDOperand LoSt = DAG.getStore(NewChain, LoLd, LoAddr,
                           St->getSrcValue(), St->getSrcValueOffset(),
@@ -6162,10 +6634,11 @@
   SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
   default: break;
-  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, Subtarget);
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
+  case ISD::BUILD_VECTOR:
+    return PerformBuildVectorCombine(N, DAG, Subtarget, *this);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
-  case ISD::STORE:          
-      return PerformSTORECombine(cast<StoreSDNode>(N), DAG, Subtarget);
+  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
@@ -6205,17 +6678,18 @@
 /// LowerXConstraint - try to replace an X constraint, which matches anything,
 /// with another that has more specific requirements based on the type of the
 /// corresponding operand.
-void X86TargetLowering::lowerXConstraint(MVT::ValueType ConstraintVT, 
-                                         std::string& s) const {
-  if (MVT::isFloatingPoint(ConstraintVT)) {
+const char *X86TargetLowering::
+LowerXConstraint(MVT ConstraintVT) const {
+  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+  // 'f' like normal targets.
+  if (ConstraintVT.isFloatingPoint()) {
     if (Subtarget->hasSSE2())
-      s = "Y";
-    else if (Subtarget->hasSSE1())
-      s = "x";
-    else
-      s = "f";
-  } else
-    return TargetLowering::lowerXConstraint(ConstraintVT, s);
+      return "Y";
+    if (Subtarget->hasSSE1())
+      return "x";
+  }
+  
+  return TargetLowering::LowerXConstraint(ConstraintVT);
 }
 
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
@@ -6223,7 +6697,7 @@
 void X86TargetLowering::LowerAsmOperandForConstraint(SDOperand Op,
                                                      char Constraint,
                                                      std::vector<SDOperand>&Ops,
-                                                     SelectionDAG &DAG) {
+                                                     SelectionDAG &DAG) const {
   SDOperand Result(0, 0);
   
   switch (Constraint) {
@@ -6301,7 +6775,7 @@
 
 std::vector<unsigned> X86TargetLowering::
 getRegClassForInlineAsmConstraint(const std::string &Constraint,
-                                  MVT::ValueType VT) const {
+                                  MVT VT) const {
   if (Constraint.size() == 1) {
     // FIXME: not handling fp-stack yet!
     switch (Constraint[0]) {      // GCC X86 Constraint Letters
@@ -6329,7 +6803,7 @@
 
 std::pair<unsigned, const TargetRegisterClass*>
 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                MVT::ValueType VT) const {
+                                                MVT VT) const {
   // First, see if this is a constraint that directly corresponds to an LLVM
   // register class.
   if (Constraint.size() == 1) {
@@ -6365,8 +6839,8 @@
       // FALL THROUGH.
     case 'x':   // SSE_REGS if SSE1 allowed
       if (!Subtarget->hasSSE1()) break;
-      
-      switch (VT) {
+
+      switch (VT.getSimpleVT()) {
       default: break;
       // Scalar SSE types.
       case MVT::f32:

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86ISelLowering.h?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86ISelLowering.h Sun Jul  6 15:45:41 2008
@@ -181,10 +181,10 @@
       /// in order to obtain suitable precision.
       FRSQRT, FRCP,
 
-      // Thread Local Storage
+      // TLSADDR, THREAThread - Thread Local Storage.
       TLSADDR, THREAD_POINTER,
 
-      // Exception Handling helpers
+      // EH_RETURN - Exception Handling helpers.
       EH_RETURN,
       
       /// TC_RETURN - Tail call return.
@@ -194,12 +194,21 @@
       ///   operand #3 optional in flag
       TC_RETURN,
 
-      // compare and swap
+      // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
       LCMPXCHG_DAG,
       LCMPXCHG8_DAG,
 
-      // Store FP control world into i16 memory
-      FNSTCW16m
+      // FNSTCW16m - Store FP control world into i16 memory.
+      FNSTCW16m,
+
+      // VZEXT_MOVL - Vector move low and zero extend.
+      VZEXT_MOVL,
+
+      // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+      VZEXT_LOAD,
+
+      // VSHL, VSRL - Vector logical left / right shift.
+      VSHL, VSRL
     };
   }
 
@@ -305,7 +314,7 @@
     int BytesCallerReserves;          // Number of arg bytes caller makes.
 
   public:
-    explicit X86TargetLowering(TargetMachine &TM);
+    explicit X86TargetLowering(X86TargetMachine &TM);
 
     /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
     /// jumptable.
@@ -330,15 +339,23 @@
     /// that contains are placed at 16-byte boundaries while the rest are at
     /// 4-byte boundaries.
     virtual unsigned getByValTypeAlignment(const Type *Ty) const;
+
+    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
+    /// determining it.
+    virtual
+    MVT getOptimalMemOpType(uint64_t Size, unsigned Align,
+                            bool isSrcConst, bool isSrcStr) const;
     
     /// LowerOperation - Provide custom lowering hooks for some operations.
     ///
     virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
 
-    /// ExpandOperation - Custom lower the specified operation, splitting the
-    /// value into two pieces.
+    /// ReplaceNodeResults - Replace a node with an illegal result type
+    /// with a new node built out of custom code.
     ///
-    virtual SDNode *ExpandOperationResult(SDNode *N, SelectionDAG &DAG);
+    virtual SDNode *ReplaceNodeResults(SDNode *N, SelectionDAG &DAG);
 
     
     virtual SDOperand PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -346,12 +363,13 @@
     virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
                                                         MachineBasicBlock *MBB);
 
+ 
     /// getTargetNodeName - This method returns the name of a target specific
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
     /// getSetCCResultType - Return the ISD::SETCC ValueType
-    virtual MVT::ValueType getSetCCResultType(const SDOperand &) const;
+    virtual MVT getSetCCResultType(const SDOperand &) const;
 
     /// computeMaskedBitsForTargetNode - Determine which of the bits specified 
     /// in Mask are known to be either zero or one and return them in the 
@@ -362,6 +380,9 @@
                                                 APInt &KnownOne,
                                                 const SelectionDAG &DAG,
                                                 unsigned Depth = 0) const;
+
+    virtual bool
+    isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) const;
     
     SDOperand getReturnAddressFrameIndex(SelectionDAG &DAG);
 
@@ -369,17 +390,16 @@
      
     std::vector<unsigned> 
       getRegClassForInlineAsmConstraint(const std::string &Constraint,
-                                        MVT::ValueType VT) const;
+                                        MVT VT) const;
 
-    virtual void lowerXConstraint(MVT::ValueType ConstraintVT, 
-                                  std::string&) const;
+    virtual const char *LowerXConstraint(MVT ConstraintVT) const;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops.
     virtual void LowerAsmOperandForConstraint(SDOperand Op,
                                               char ConstraintLetter,
                                               std::vector<SDOperand> &Ops,
-                                              SelectionDAG &DAG);
+                                              SelectionDAG &DAG) const;
     
     /// getRegForInlineAsmConstraint - Given a physical register constraint
     /// (e.g. {edx}), return the register number and the register class for the
@@ -387,7 +407,7 @@
     /// error, this returns a register number of 0.
     std::pair<unsigned, const TargetRegisterClass*> 
       getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   MVT::ValueType VT) const;
+                                   MVT VT) const;
     
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
@@ -397,26 +417,25 @@
     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
     /// register EAX to i16 by referencing its sub-register AX.
     virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
-    virtual bool isTruncateFree(MVT::ValueType VT1, MVT::ValueType VT2) const;
+    virtual bool isTruncateFree(MVT VT1, MVT VT2) const;
   
     /// isShuffleMaskLegal - Targets can use this to indicate that they only
     /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
     /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
     /// values are assumed to be legal.
-    virtual bool isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const;
+    virtual bool isShuffleMaskLegal(SDOperand Mask, MVT VT) const;
 
     /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is
     /// used by Targets can use this to indicate if there is a suitable
     /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
     /// pool entry.
     virtual bool isVectorClearMaskLegal(const std::vector<SDOperand> &BVOps,
-                                        MVT::ValueType EVT,
-                                        SelectionDAG &DAG) const;
+                                        MVT EVT, SelectionDAG &DAG) const;
 
     /// ShouldShrinkFPConstant - If true, then instruction selection should
     /// seek to shrink the FP constant of the specified type to a smaller type
     /// in order to save space and / or reduce runtime.
-    virtual bool ShouldShrinkFPConstant(MVT::ValueType VT) const {
+    virtual bool ShouldShrinkFPConstant(MVT VT) const {
       // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
       // expensive than a straight movsd. On the other hand, it's important to
       // shrink long double fp constant since fldt is very slow.
@@ -436,7 +455,7 @@
 
     /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
     /// computed in an SSE register, not on the X87 floating point stack.
-    bool isScalarFPTypeInSSEReg(MVT::ValueType VT) const {
+    bool isScalarFPTypeInSSEReg(MVT VT) const {
       return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
       (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
     }
@@ -445,7 +464,7 @@
     /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
     /// make the right decision when generating code for different targets.
     const X86Subtarget *Subtarget;
-    const TargetRegisterInfo *RegInfo;
+    const X86RegisterInfo *RegInfo;
 
     /// X86StackPtr - X86 physical register used as stack ptr.
     unsigned X86StackPtr;
@@ -476,15 +495,6 @@
     SDOperand EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDOperand &OutRetAddr,
                                 SDOperand Chain, bool IsTailCall, bool Is64Bit,
                                 int FPDiff);
-    
-    bool CopyTailCallByValClobberedRegToVirtReg(bool containsByValArg,
-     SmallVector< std::pair<unsigned, unsigned>,8> &TailCallByValClobberedVRegs,
-     SmallVector<MVT::ValueType, 8> &TailCallByValClobberedVRegTypes,
-     std::pair<unsigned, SDOperand> &RegToPass,
-     SDOperand &OutChain,
-     SDOperand &OutFlag,
-     MachineFunction &MF,
-     SelectionDAG & DAG);
 
     CCAssignFn *CCAssignFnForNode(SDOperand Op) const;
     NameDecorationStyle NameDecorationForFORMAL_ARGUMENTS(SDOperand Op);
@@ -520,6 +530,7 @@
     SDOperand LowerDYNAMIC_STACKALLOC(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerVAARG(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerVACOPY(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG);
@@ -530,23 +541,41 @@
     SDOperand LowerFLT_ROUNDS_(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerCTLZ(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerCTTZ(SDOperand Op, SelectionDAG &DAG);
-    SDOperand LowerLCS(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerCMP_SWAP(SDOperand Op, SelectionDAG &DAG);
     SDNode *ExpandFP_TO_SINT(SDNode *N, SelectionDAG &DAG);
     SDNode *ExpandREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG);
-    SDNode *ExpandATOMIC_LCS(SDNode *N, SelectionDAG &DAG);
-
+    SDNode *ExpandATOMIC_CMP_SWAP(SDNode *N, SelectionDAG &DAG);
+    SDNode *ExpandATOMIC_LOAD_SUB(SDNode *N, SelectionDAG &DAG);
+    
     SDOperand EmitTargetCodeForMemset(SelectionDAG &DAG,
                                       SDOperand Chain,
                                       SDOperand Dst, SDOperand Src,
                                       SDOperand Size, unsigned Align,
-                                      const Value *DstSV, uint64_t DstOff);
+                                      const Value *DstSV, uint64_t DstSVOff);
     SDOperand EmitTargetCodeForMemcpy(SelectionDAG &DAG,
                                       SDOperand Chain,
                                       SDOperand Dst, SDOperand Src,
                                       SDOperand Size, unsigned Align,
                                       bool AlwaysInline,
-                                      const Value *DstSV, uint64_t DstOff,
-                                      const Value *SrcSV, uint64_t SrcOff);
+                                      const Value *DstSV, uint64_t DstSVOff,
+                                      const Value *SrcSV, uint64_t SrcSVOff);
+    
+    /// Utility function to emit atomic bitwise operations (and, or, xor).
+    // It takes the bitwise instruction to expand, the associated machine basic
+    // block, and the associated X86 opcodes for reg/reg and reg/imm.
+    MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
+                                                    MachineInstr *BInstr,
+                                                    MachineBasicBlock *BB,
+                                                    unsigned regOpc,
+                                                    unsigned immOpc,
+                                                    bool invSrc = false);
+    
+    /// Utility function to emit atomic min and max.  It takes the min/max
+    // instruction to expand, the associated basic block, and the associated
+    // cmov opcode for moving the min or max value.
+    MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
+                                                          MachineBasicBlock *BB,
+                                                          unsigned cmovOpc);
   };
 }
 

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86Instr64bit.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86Instr64bit.td?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86Instr64bit.td (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86Instr64bit.td Sun Jul  6 15:45:41 2008
@@ -101,7 +101,7 @@
     def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
                           "call\t{*}$dst", [(X86call GR64:$dst)]>;
     def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
-                          "call\t{*}$dst", []>;
+                          "call\t{*}$dst", [(X86call (loadi64 addr:$dst))]>;
   }
 
 
@@ -199,7 +199,7 @@
 def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
                  "mov{q}\t{$src, $dst|$dst, $src}", []>;
 
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1  in {
 def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
                     "movabs{q}\t{$src, $dst|$dst, $src}",
                     [(set GR64:$dst, imm:$src)]>;
@@ -751,7 +751,7 @@
                   [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
 
 let isTwoAddress = 1 in {
-let isCommutable = 1 in
+let isCommutable = 1, isAsCheapAsAMove = 1 in
 def XOR64rr  : RI<0x31, MRMDestReg,  (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), 
                   "xor{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (xor GR64:$src1, GR64:$src2))]>;
@@ -1091,7 +1091,8 @@
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 // FIXME: AddedComplexity gives MOV64r0 a higher priority than MOV64ri32. Remove
 // when we have a better way to specify isel priority.
-let Defs = [EFLAGS], AddedComplexity = 1, isReMaterializable = 1 in
+let Defs = [EFLAGS], AddedComplexity = 1,
+    isReMaterializable = 1, isAsCheapAsAMove = 1 in
 def MOV64r0  : RI<0x31, MRMInitReg,  (outs GR64:$dst), (ins),
                  "xor{l}\t${dst:subreg32}, ${dst:subreg32}",
                  [(set GR64:$dst, 0)]>;
@@ -1102,6 +1103,13 @@
                         "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
                         [(set GR64:$dst, i64immZExt32:$src)]>;
 
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//===----------------------------------------------------------------------===//
+
+def TLS_addr64 : I<0, Pseudo, (outs GR64:$dst), (ins i64imm:$sym),
+              ".byte\t0x66; leaq\t${sym:mem}(%rip), $dst; .word\t0x6666; rex64",
+                  [(set GR64:$dst, (X86tlsaddr tglobaltlsaddr:$sym))]>;
 
 //===----------------------------------------------------------------------===//
 // Atomic Instructions
@@ -1116,7 +1124,7 @@
 let Constraints = "$val = $dst", Defs = [EFLAGS] in {
 def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val),
                "lock xadd $val, $ptr", 
-               [(set GR64:$dst, (atomic_las_64 addr:$ptr, GR64:$val))]>,
+               [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>,
                 TB, LOCK;
 def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val),
                   "xchg $val, $ptr", 

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86InstrBuilder.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86InstrBuilder.h?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86InstrBuilder.h (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86InstrBuilder.h Sun Jul  6 15:45:41 2008
@@ -70,15 +70,19 @@
 /// displacement. An example is: DWORD PTR [EAX + 4].
 ///
 inline const MachineInstrBuilder &addRegOffset(const MachineInstrBuilder &MIB,
-                                               unsigned Reg, int Offset) {
-  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset);
+                                               unsigned Reg, bool isKill,
+                                               int Offset) {
+  return MIB.addReg(Reg, false, false, isKill)
+    .addImm(1).addReg(0).addImm(Offset);
 }
 
 /// addRegReg - This function is used to add a memory reference of the form:
 /// [Reg + Reg].
 inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
-                                            unsigned Reg1, unsigned Reg2) {
-  return MIB.addReg(Reg1).addImm(1).addReg(Reg2).addImm(0);
+                                            unsigned Reg1, bool isKill1,
+                                            unsigned Reg2, bool isKill2) {
+  return MIB.addReg(Reg1, false, false, isKill1).addImm(1)
+    .addReg(Reg2, false, false, isKill2).addImm(0);
 }
 
 inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.cpp Sun Jul  6 15:45:41 2008
@@ -569,8 +569,12 @@
     { X86::PMAXUBrr,        X86::PMAXUBrm },
     { X86::PMINSWrr,        X86::PMINSWrm },
     { X86::PMINUBrr,        X86::PMINUBrm },
+    { X86::PMULDQrr,        X86::PMULDQrm },
+    { X86::PMULDQrr_int,    X86::PMULDQrm_int },
     { X86::PMULHUWrr,       X86::PMULHUWrm },
     { X86::PMULHWrr,        X86::PMULHWrm },
+    { X86::PMULLDrr,        X86::PMULLDrm },
+    { X86::PMULLDrr_int,    X86::PMULLDrm_int },
     { X86::PMULLWrr,        X86::PMULLWrm },
     { X86::PMULUDQrr,       X86::PMULUDQrm },
     { X86::PORrr,           X86::PORrm },
@@ -760,7 +764,8 @@
   return TM.getSubtarget<X86Subtarget>().GVRequiresExtraLoad(GV, TM, false);
 }
  
-bool X86InstrInfo::isReallyTriviallyReMaterializable(MachineInstr *MI) const {
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
   switch (MI->getOpcode()) {
   default: break;
     case X86::MOV8rm:
@@ -827,6 +832,40 @@
   return true;
 }
 
+/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction that
+/// would clobber the EFLAGS condition register. Note the result may be
+/// conservative. If it cannot definitely determine the safety after visiting
+/// two instructions it assumes it's not safe.
+static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) {
+  // For compile time consideration, if we are not able to determine the
+  // safety after visiting 2 instructions, we will assume it's not safe.
+  for (unsigned i = 0; i < 2; ++i) {
+    if (I == MBB.end())
+      // Reached end of block, it's safe.
+      return true;
+    bool SeenDef = false;
+    for (unsigned j = 0, e = I->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = I->getOperand(j);
+      if (!MO.isRegister())
+        continue;
+      if (MO.getReg() == X86::EFLAGS) {
+        if (MO.isUse())
+          return false;
+        SeenDef = true;
+      }
+    }
+
+    if (SeenDef)
+      // This instruction defines EFLAGS, no need to look any further.
+      return true;
+    ++I;
+  }
+
+  // Conservative answer.
+  return false;
+}
+
 void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I,
                                  unsigned DestReg,
@@ -841,25 +880,33 @@
 
   // MOV32r0 etc. are implemented with xor which clobbers condition code.
   // Re-materialize them as movri instructions to avoid side effects.
+  bool Emitted = false;
   switch (Orig->getOpcode()) {
+  default: break;
   case X86::MOV8r0:
-    BuildMI(MBB, I, get(X86::MOV8ri), DestReg).addImm(0);
-    break;
   case X86::MOV16r0:
-    BuildMI(MBB, I, get(X86::MOV16ri), DestReg).addImm(0);
-    break;
   case X86::MOV32r0:
-    BuildMI(MBB, I, get(X86::MOV32ri), DestReg).addImm(0);
-    break;
-  case X86::MOV64r0:
-    BuildMI(MBB, I, get(X86::MOV64ri32), DestReg).addImm(0);
+  case X86::MOV64r0: {
+    if (!isSafeToClobberEFLAGS(MBB, I)) {
+      unsigned Opc = 0;
+      switch (Orig->getOpcode()) {
+      default: break;
+      case X86::MOV8r0:  Opc = X86::MOV8ri;  break;
+      case X86::MOV16r0: Opc = X86::MOV16ri; break;
+      case X86::MOV32r0: Opc = X86::MOV32ri; break;
+      case X86::MOV64r0: Opc = X86::MOV64ri32; break;
+      }
+      BuildMI(MBB, I, get(Opc), DestReg).addImm(0);
+      Emitted = true;
+    }
     break;
-  default: {
+  }
+  }
+
+  if (!Emitted) {
     MachineInstr *MI = Orig->clone();
     MI->getOperand(0).setReg(DestReg);
     MBB.insert(I, MI);
-    break;
-  }
   }
 
   if (ChangeSubIdx) {
@@ -931,11 +978,13 @@
 MachineInstr *
 X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                     MachineBasicBlock::iterator &MBBI,
-                                    LiveVariables &LV) const {
+                                    LiveVariables *LV) const {
   MachineInstr *MI = MBBI;
   // All instructions input are two-addr instructions.  Get the known operands.
   unsigned Dest = MI->getOperand(0).getReg();
   unsigned Src = MI->getOperand(1).getReg();
+  bool isDead = MI->getOperand(0).isDead();
+  bool isKill = MI->getOperand(1).isKill();
 
   MachineInstr *NewMI = NULL;
   // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
@@ -948,51 +997,47 @@
     assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
     if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
     
-    unsigned A = MI->getOperand(0).getReg();
     unsigned B = MI->getOperand(1).getReg();
     unsigned C = MI->getOperand(2).getReg();
-    unsigned M = MI->getOperand(3).getImm();
     if (B != C) return 0;
-    NewMI = BuildMI(get(X86::PSHUFDri), A).addReg(B).addImm(M);
+    unsigned A = MI->getOperand(0).getReg();
+    unsigned M = MI->getOperand(3).getImm();
+    NewMI = BuildMI(get(X86::PSHUFDri)).addReg(A, true, false, false, isDead)
+      .addReg(B, false, false, isKill).addImm(M);
     break;
   }
   case X86::SHL64ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
     // the flags produced by a shift yet, so this is safe.
-    unsigned Dest = MI->getOperand(0).getReg();
-    unsigned Src = MI->getOperand(1).getReg();
     unsigned ShAmt = MI->getOperand(2).getImm();
     if (ShAmt == 0 || ShAmt >= 4) return 0;
-    
-    NewMI = BuildMI(get(X86::LEA64r), Dest)
-      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+
+    NewMI = BuildMI(get(X86::LEA64r)).addReg(Dest, true, false, false, isDead)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src, false, false, isKill).addImm(0);
     break;
   }
   case X86::SHL32ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
     // the flags produced by a shift yet, so this is safe.
-    unsigned Dest = MI->getOperand(0).getReg();
-    unsigned Src = MI->getOperand(1).getReg();
     unsigned ShAmt = MI->getOperand(2).getImm();
     if (ShAmt == 0 || ShAmt >= 4) return 0;
-    
+
     unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() ?
       X86::LEA64_32r : X86::LEA32r;
-    NewMI = BuildMI(get(Opc), Dest)
-      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    NewMI = BuildMI(get(Opc)).addReg(Dest, true, false, false, isDead)
+      .addReg(0).addImm(1 << ShAmt)
+      .addReg(Src, false, false, isKill).addImm(0);
     break;
   }
   case X86::SHL16ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
     // the flags produced by a shift yet, so this is safe.
-    unsigned Dest = MI->getOperand(0).getReg();
-    unsigned Src = MI->getOperand(1).getReg();
     unsigned ShAmt = MI->getOperand(2).getImm();
     if (ShAmt == 0 || ShAmt >= 4) return 0;
-    
+
     if (DisableLEA16) {
       // If 16-bit LEA is disabled, use 32-bit LEA via subregisters.
       MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
@@ -1003,31 +1048,36 @@
             
       // Build and insert into an implicit UNDEF value. This is OK because
       // well be shifting and then extracting the lower 16-bits. 
-      MachineInstr *Undef = BuildMI(get(X86::IMPLICIT_DEF), leaInReg);
+      MachineInstr *Undef = BuildMI(get(X86::IMPLICIT_DEF), leaInReg);      
+      MachineInstr *InsMI =  BuildMI(get(X86::INSERT_SUBREG),leaInReg)
+        .addReg(leaInReg).addReg(Src, false, false, isKill)
+        .addImm(X86::SUBREG_16BIT);
       
-      MachineInstr *Ins = 
-       BuildMI(get(X86::INSERT_SUBREG),leaInReg)
-                    .addReg(leaInReg).addReg(Src).addImm(X86::SUBREG_16BIT);
+      NewMI = BuildMI(get(Opc), leaOutReg).addReg(0).addImm(1 << ShAmt)
+        .addReg(leaInReg, false, false, true).addImm(0);
       
-      NewMI = BuildMI(get(Opc), leaOutReg)
-        .addReg(0).addImm(1 << ShAmt).addReg(leaInReg).addImm(0);
-      
-      MachineInstr *Ext =
-        BuildMI(get(X86::EXTRACT_SUBREG), Dest)
-         .addReg(leaOutReg).addImm(X86::SUBREG_16BIT);
-      Ext->copyKillDeadInfo(MI);
+      MachineInstr *ExtMI = BuildMI(get(X86::EXTRACT_SUBREG))
+        .addReg(Dest, true, false, false, isDead)
+        .addReg(leaOutReg, false, false, true).addImm(X86::SUBREG_16BIT);
       
       MFI->insert(MBBI, Undef);
-      MFI->insert(MBBI, Ins);            // Insert the insert_subreg
-      LV.instructionChanged(MI, NewMI);  // Update live variables
-      LV.addVirtualRegisterKilled(leaInReg, NewMI);
-      MFI->insert(MBBI, NewMI);          // Insert the new inst
-      LV.addVirtualRegisterKilled(leaOutReg, Ext);
-      MFI->insert(MBBI, Ext);            // Insert the extract_subreg      
-      return Ext;
+      MFI->insert(MBBI, InsMI);            // Insert the insert_subreg
+      MFI->insert(MBBI, NewMI);            // Insert the lea inst
+      MFI->insert(MBBI, ExtMI);            // Insert the extract_subreg      
+      if (LV) {
+        // Update live variables
+        LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+        LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+        if (isKill)
+          LV->replaceKillInstruction(Src, MI, InsMI);
+        if (isDead)
+          LV->replaceKillInstruction(Dest, MI, ExtMI);
+      }
+      return ExtMI;
     } else {
-      NewMI = BuildMI(get(X86::LEA16r), Dest)
-        .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+      NewMI = BuildMI(get(X86::LEA16r)).addReg(Dest, true, false, false, isDead)
+        .addReg(0).addImm(1 << ShAmt)
+        .addReg(Src, false, false, isKill).addImm(0);
     }
     break;
   }
@@ -1046,58 +1096,79 @@
       assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
       unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
         : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-      NewMI = addRegOffset(BuildMI(get(Opc), Dest), Src, 1);
+      NewMI = addRegOffset(BuildMI(get(Opc))
+                           .addReg(Dest, true, false, false, isDead),
+                           Src, isKill, 1);
       break;
     }
     case X86::INC16r:
     case X86::INC64_16r:
       if (DisableLEA16) return 0;
       assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
-      NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, 1);
+      NewMI = addRegOffset(BuildMI(get(X86::LEA16r))
+                           .addReg(Dest, true, false, false, isDead),
+                           Src, isKill, 1);
       break;
     case X86::DEC64r:
     case X86::DEC32r: {
       assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
       unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
         : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-      NewMI = addRegOffset(BuildMI(get(Opc), Dest), Src, -1);
+      NewMI = addRegOffset(BuildMI(get(Opc))
+                           .addReg(Dest, true, false, false, isDead),
+                           Src, isKill, -1);
       break;
     }
     case X86::DEC16r:
     case X86::DEC64_16r:
       if (DisableLEA16) return 0;
       assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
-      NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, -1);
+      NewMI = addRegOffset(BuildMI(get(X86::LEA16r))
+                           .addReg(Dest, true, false, false, isDead),
+                           Src, isKill, -1);
       break;
     case X86::ADD64rr:
     case X86::ADD32rr: {
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       unsigned Opc = MIOpc == X86::ADD64rr ? X86::LEA64r
         : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-      NewMI = addRegReg(BuildMI(get(Opc), Dest), Src,
-                        MI->getOperand(2).getReg());
+      unsigned Src2 = MI->getOperand(2).getReg();
+      bool isKill2 = MI->getOperand(2).isKill();
+      NewMI = addRegReg(BuildMI(get(Opc))
+                        .addReg(Dest, true, false, false, isDead),
+                        Src, isKill, Src2, isKill2);
+      if (LV && isKill2)
+        LV->replaceKillInstruction(Src2, MI, NewMI);
       break;
     }
-    case X86::ADD16rr:
+    case X86::ADD16rr: {
       if (DisableLEA16) return 0;
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addRegReg(BuildMI(get(X86::LEA16r), Dest), Src,
-                        MI->getOperand(2).getReg());
+      unsigned Src2 = MI->getOperand(2).getReg();
+      bool isKill2 = MI->getOperand(2).isKill();
+      NewMI = addRegReg(BuildMI(get(X86::LEA16r))
+                        .addReg(Dest, true, false, false, isDead),
+                        Src, isKill, Src2, isKill2);
+      if (LV && isKill2)
+        LV->replaceKillInstruction(Src2, MI, NewMI);
       break;
+    }
     case X86::ADD64ri32:
     case X86::ADD64ri8:
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       if (MI->getOperand(2).isImmediate())
-        NewMI = addRegOffset(BuildMI(get(X86::LEA64r), Dest), Src,
-                             MI->getOperand(2).getImm());
+        NewMI = addRegOffset(BuildMI(get(X86::LEA64r))
+                             .addReg(Dest, true, false, false, isDead),
+                             Src, isKill, MI->getOperand(2).getImm());
       break;
     case X86::ADD32ri:
     case X86::ADD32ri8:
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       if (MI->getOperand(2).isImmediate()) {
         unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-        NewMI = addRegOffset(BuildMI(get(Opc), Dest), Src,
-                             MI->getOperand(2).getImm());
+        NewMI = addRegOffset(BuildMI(get(Opc))
+                             .addReg(Dest, true, false, false, isDead),
+                             Src, isKill, MI->getOperand(2).getImm());
       }
       break;
     case X86::ADD16ri:
@@ -1105,8 +1176,9 @@
       if (DisableLEA16) return 0;
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       if (MI->getOperand(2).isImmediate())
-        NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src,
-                             MI->getOperand(2).getImm());
+        NewMI = addRegOffset(BuildMI(get(X86::LEA16r))
+                             .addReg(Dest, true, false, false, isDead),
+                             Src, isKill, MI->getOperand(2).getImm());
       break;
     case X86::SHL16ri:
       if (DisableLEA16) return 0;
@@ -1122,7 +1194,10 @@
         unsigned Opc = MIOpc == X86::SHL64ri ? X86::LEA64r
           : (MIOpc == X86::SHL32ri
              ? (is64Bit ? X86::LEA64_32r : X86::LEA32r) : X86::LEA16r);
-        NewMI = addFullAddress(BuildMI(get(Opc), Dest), AM);
+        NewMI = addFullAddress(BuildMI(get(Opc))
+                               .addReg(Dest, true, false, false, isDead), AM);
+        if (isKill)
+          NewMI->getOperand(3).setIsKill(true);
       }
       break;
     }
@@ -1132,8 +1207,13 @@
 
   if (!NewMI) return 0;
 
-  NewMI->copyKillDeadInfo(MI);
-  LV.instructionChanged(MI, NewMI);  // Update live variables
+  if (LV) {  // Update live variables
+    if (isKill)
+      LV->replaceKillInstruction(Src, MI, NewMI);
+    if (isDead)
+      LV->replaceKillInstruction(Dest, MI, NewMI);
+  }
+
   MFI->insert(MBBI, NewMI);          // Insert the new inst    
   return NewMI;
 }
@@ -1141,7 +1221,8 @@
 /// commuteInstruction - We have a few instructions that must be hacked on to
 /// commute them.
 ///
-MachineInstr *X86InstrInfo::commuteInstruction(MachineInstr *MI) const {
+MachineInstr *
+X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
   switch (MI->getOpcode()) {
   case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
   case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
@@ -1164,6 +1245,7 @@
     unsigned A = MI->getOperand(0).getReg();
     unsigned B = MI->getOperand(1).getReg();
     unsigned C = MI->getOperand(2).getReg();
+    bool AisDead = MI->getOperand(0).isDead();
     bool BisKill = MI->getOperand(1).isKill();
     bool CisKill = MI->getOperand(2).isKill();
     // If machine instrs are no longer in two-address forms, update
@@ -1175,7 +1257,8 @@
       A = C;
       CisKill = false;
     }
-    return BuildMI(get(Opc), A).addReg(C, false, false, CisKill)
+    return BuildMI(get(Opc)).addReg(A, true, false, false, AisDead)
+      .addReg(C, false, false, CisKill)
       .addReg(B, false, false, BisKill).addImm(Size-Amt);
   }
   case X86::CMOVB16rr:
@@ -1271,7 +1354,7 @@
     // Fallthrough intended.
   }
   default:
-    return TargetInstrInfoImpl::commuteInstruction(MI);
+    return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
   }
 }
 
@@ -1455,7 +1538,7 @@
                                                      MachineOperand &MO) {
   if (MO.isRegister())
     MIB = MIB.addReg(MO.getReg(), MO.isDef(), MO.isImplicit(),
-                     false, false, MO.getSubReg());
+                     MO.isKill(), MO.isDead(), MO.getSubReg());
   else if (MO.isImmediate())
     MIB = MIB.addImm(MO.getImm());
   else if (MO.isFrameIndex())
@@ -1717,8 +1800,8 @@
 }
 
 void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
-                                      SmallVectorImpl<MachineOperand> &Addr,
-                                      const TargetRegisterClass *RC,
+                                 SmallVectorImpl<MachineOperand> &Addr,
+                                 const TargetRegisterClass *RC,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
   unsigned Opc = getLoadRegOpcode(RC, RI.getStackAlignment());
   MachineInstrBuilder MIB = BuildMI(get(Opc), DestReg);
@@ -1854,10 +1937,8 @@
       NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
     else if (MI->getOpcode() == X86::MOV8r0)
       NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
-    if (NewMI) {
-      NewMI->copyKillDeadInfo(MI);
+    if (NewMI)
       return NewMI;
-    }
     
     OpcodeTablePtr = &RegOp2MemOpTable0;
   } else if (i == 1) {
@@ -1876,7 +1957,6 @@
         NewMI = FuseTwoAddrInst(I->second, MOs, MI, *this);
       else
         NewMI = FuseInst(I->second, i, MOs, MI, *this);
-      NewMI->copyKillDeadInfo(MI);
       return NewMI;
     }
   }
@@ -2188,14 +2268,14 @@
   // Emit the load instruction.
   SDNode *Load = 0;
   if (FoldedLoad) {
-    MVT::ValueType VT = *RC->vt_begin();
+    MVT VT = *RC->vt_begin();
     Load = DAG.getTargetNode(getLoadRegOpcode(RC, RI.getStackAlignment()), VT,
                              MVT::Other, &AddrOps[0], AddrOps.size());
     NewNodes.push_back(Load);
   }
 
   // Emit the data processing instruction.
-  std::vector<MVT::ValueType> VTs;
+  std::vector<MVT> VTs;
   const TargetRegisterClass *DstRC = 0;
   if (TID.getNumDefs() > 0) {
     const TargetOperandInfo &DstTOI = TID.OpInfo[0];
@@ -2204,7 +2284,7 @@
     VTs.push_back(*DstRC->vt_begin());
   }
   for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
-    MVT::ValueType VT = N->getValueType(i);
+    MVT VT = N->getValueType(i);
     if (VT != MVT::Other && i >= (unsigned)TID.getNumDefs())
       VTs.push_back(VT);
   }
@@ -2633,7 +2713,8 @@
       FinalSize += AI->getInlineAsmLength(AsmStr);
       break;
     }
-    case TargetInstrInfo::LABEL:
+    case TargetInstrInfo::DBG_LABEL:
+    case TargetInstrInfo::EH_LABEL:
       break;
     case TargetInstrInfo::IMPLICIT_DEF:
     case TargetInstrInfo::DECLARE:
@@ -2819,7 +2900,7 @@
 unsigned X86InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
   const TargetInstrDesc &Desc = MI->getDesc();
   bool IsPIC = (TM.getRelocationModel() == Reloc::PIC_);
-  bool Is64BitMode = ((X86Subtarget*)TM.getSubtargetImpl())->is64Bit();
+  bool Is64BitMode = TM.getSubtargetImpl()->is64Bit();
   unsigned Size = GetInstSizeWithDesc(*MI, &Desc, IsPIC, Is64BitMode);
   if (Desc.getOpcode() == X86::MOVPC32r) {
     Size += GetInstSizeWithDesc(*MI, &get(X86::POP32r), IsPIC, Is64BitMode);

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.h?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.h (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.h Sun Jul  6 15:45:41 2008
@@ -227,6 +227,23 @@
   };
 }
 
+inline static bool isScale(const MachineOperand &MO) {
+  return MO.isImmediate() &&
+    (MO.getImm() == 1 || MO.getImm() == 2 ||
+     MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFrameIndex()) return true;
+  return Op+4 <= MI->getNumOperands() &&
+    MI->getOperand(Op  ).isRegister() && isScale(MI->getOperand(Op+1)) &&
+    MI->getOperand(Op+2).isRegister() &&
+    (MI->getOperand(Op+3).isImmediate() ||
+     MI->getOperand(Op+3).isGlobalAddress() ||
+     MI->getOperand(Op+3).isConstantPoolIndex() ||
+     MI->getOperand(Op+3).isJumpTableIndex());
+}
+
 class X86InstrInfo : public TargetInstrInfoImpl {
   X86TargetMachine &TM;
   const X86RegisterInfo RI;
@@ -250,7 +267,7 @@
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+  virtual const X86RegisterInfo &getRegisterInfo() const { return RI; }
 
   // Return true if the instruction is a register to register move and
   // leave the source and dest operands in the passed parameters.
@@ -260,7 +277,7 @@
   unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
   unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
 
-  bool isReallyTriviallyReMaterializable(MachineInstr *MI) const;
+  bool isReallyTriviallyReMaterializable(const MachineInstr *MI) const;
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      unsigned DestReg, const MachineInstr *Orig) const;
 
@@ -278,12 +295,12 @@
   ///
   virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
                                               MachineBasicBlock::iterator &MBBI,
-                                              LiveVariables &LV) const;
+                                              LiveVariables *LV) const;
 
   /// commuteInstruction - We have a few instructions that must be hacked on to
   /// commute them.
   ///
-  virtual MachineInstr *commuteInstruction(MachineInstr *MI) const;
+  virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
 
   // Branch analysis.
   virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.td?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.td (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86InstrInfo.td Sun Jul  6 15:45:41 2008
@@ -45,7 +45,7 @@
 def SDT_X86CallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
                                          SDTCisVT<1, i32> ]>;
 
-def SDT_X86Call   : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+def SDT_X86Call   : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
 
 def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
 
@@ -109,7 +109,7 @@
 def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
 
 def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
-                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+                        [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
 def X86TLStp : SDNode<"X86ISD::THREAD_POINTER", SDT_X86TLSTP, []>;
 
 def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
@@ -229,9 +229,35 @@
 }]>;
 
 // Helper fragments for loads.
+// It's always safe to treat a anyext i16 load as a i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (ld node:$ptr)), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    if (LD->getAddressingMode() != ISD::UNINDEXED)
+      return false;
+    ISD::LoadExtType ExtType = LD->getExtensionType();
+    if (ExtType == ISD::NON_EXTLOAD)
+      return true;
+    if (ExtType == ISD::EXTLOAD)
+      return LD->getAlignment() >= 2 && !LD->isVolatile();
+  }
+  return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (ld node:$ptr)), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    if (LD->getAddressingMode() != ISD::UNINDEXED)
+      return false;
+    ISD::LoadExtType ExtType = LD->getExtensionType();
+    if (ExtType == ISD::NON_EXTLOAD)
+      return true;
+    if (ExtType == ISD::EXTLOAD)
+      return LD->getAlignment() >= 4 && !LD->isVolatile();
+  }
+  return false;
+}]>;
+
 def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
-def loadi16 : PatFrag<(ops node:$ptr), (i16 (load node:$ptr))>;
-def loadi32 : PatFrag<(ops node:$ptr), (i32 (load node:$ptr))>;
 def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
 
 def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
@@ -298,7 +324,7 @@
     hasCtrlDep = 1, FPForm = SpecialFP, FPFormBits = SpecialFP.Value in {
   def RET    : I   <0xC3, RawFrm, (outs), (ins variable_ops),
                     "ret",
-                    [/*(X86retflag 0)*/ /*FIXME: Disabled: rdar://5791600*/]>;
+                    [(X86retflag 0)]>;
   def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
                     "ret\t$amt",
                     [(X86retflag imm:$amt)]>;
@@ -371,7 +397,7 @@
     def CALL32r     : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
                         "call\t{*}$dst", [(X86call GR32:$dst)]>;
     def CALL32m     : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
-                        "call\t{*}$dst", []>;
+                        "call\t{*}$dst", [(X86call (loadi32 addr:$dst))]>;
   }
 
 // Tail call stuff.
@@ -391,6 +417,7 @@
                  []>;
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+
   def TAILJMPd : IBr<0xE9, (ins i32imm:$dst), "jmp\t${dst:call}  # TAILCALL",
                  []>;
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
@@ -551,7 +578,7 @@
 def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
                 "mov{l}\t{$src, $dst|$dst, $src}", []>;
 }
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def MOV8ri  : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
                    "mov{b}\t{$src, $dst|$dst, $src}",
                    [(set GR8:$dst, imm:$src)]>;
@@ -1282,23 +1309,23 @@
   def OR32mi8  : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$src),
                  "or{l}\t{$src, $dst|$dst, $src}",
                  [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
-}
+} // isTwoAddress = 0
 
 
-let isCommutable = 1 in {   // X = XOR Y, Z   --> X = XOR Z, Y
-def XOR8rr   : I<0x30, MRMDestReg,
-                 (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
-                 "xor{b}\t{$src2, $dst|$dst, $src2}",
-                 [(set GR8:$dst, (xor GR8:$src1, GR8:$src2))]>;
-def XOR16rr  : I<0x31, MRMDestReg, 
-                 (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), 
-                 "xor{w}\t{$src2, $dst|$dst, $src2}",
-                 [(set GR16:$dst, (xor GR16:$src1, GR16:$src2))]>, OpSize;
-def XOR32rr  : I<0x31, MRMDestReg, 
-                 (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), 
-                 "xor{l}\t{$src2, $dst|$dst, $src2}",
-                 [(set GR32:$dst, (xor GR32:$src1, GR32:$src2))]>;
-}
+let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y
+  def XOR8rr   : I<0x30, MRMDestReg,
+                   (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+                   "xor{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (xor GR8:$src1, GR8:$src2))]>;
+  def XOR16rr  : I<0x31, MRMDestReg, 
+                   (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), 
+                   "xor{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (xor GR16:$src1, GR16:$src2))]>, OpSize;
+  def XOR32rr  : I<0x31, MRMDestReg, 
+                   (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), 
+                   "xor{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (xor GR32:$src1, GR32:$src2))]>;
+} // isCommutable = 1
 
 def XOR8rm   : I<0x32, MRMSrcMem , 
                  (outs GR8 :$dst), (ins GR8:$src1, i8mem :$src2), 
@@ -1307,7 +1334,8 @@
 def XOR16rm  : I<0x33, MRMSrcMem , 
                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), 
                  "xor{w}\t{$src2, $dst|$dst, $src2}",
-                 [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2)))]>, OpSize;
+                 [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2)))]>,
+                 OpSize;
 def XOR32rm  : I<0x33, MRMSrcMem , 
                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), 
                  "xor{l}\t{$src2, $dst|$dst, $src2}",
@@ -1334,6 +1362,7 @@
                    (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
                    "xor{l}\t{$src2, $dst|$dst, $src2}",
                    [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2))]>;
+
 let isTwoAddress = 0 in {
   def XOR8mr   : I<0x30, MRMDestMem,
                    (outs), (ins i8mem :$dst, GR8 :$src),
@@ -1370,7 +1399,7 @@
                      (outs), (ins i32mem:$dst, i32i8imm :$src),
                      "xor{l}\t{$src, $dst|$dst, $src}",
                  [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
-}
+} // isTwoAddress = 0
 } // Defs = [EFLAGS]
 
 // Shift instructions
@@ -1385,7 +1414,7 @@
 def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src),
                  "shl{l}\t{%cl, $dst|$dst, %CL}",
                  [(set GR32:$dst, (shl GR32:$src, CL))]>;
-}
+} // Uses = [CL]
 
 def SHL8ri   : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
                    "shl{b}\t{$src2, $dst|$dst, $src2}",
@@ -1399,7 +1428,7 @@
                    [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
 // NOTE: We don't use shifts of a register by one, because 'add reg,reg' is
 // cheaper.
-}
+} // isConvertibleToThreeAddress = 1
 
 let isTwoAddress = 0 in {
   let Uses = [CL] in {
@@ -2455,7 +2484,7 @@
 
 // Alias instructions that map movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
-let Defs = [EFLAGS], isReMaterializable = 1 in {
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def MOV8r0   : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins),
                  "xor{b}\t$dst, $dst",
                  [(set GR8:$dst, 0)]>;
@@ -2499,12 +2528,12 @@
 //
 
 let Uses = [EBX] in
-def TLS_addr : I<0, Pseudo, (outs GR32:$dst), (ins i32imm:$sym),
-               "leal\t${sym:mem}(,%ebx,1), $dst",
-               [(set GR32:$dst, (X86tlsaddr tglobaltlsaddr:$sym))]>;
+def TLS_addr32 : I<0, Pseudo, (outs GR32:$dst), (ins i32imm:$sym),
+                  "leal\t${sym:mem}(,%ebx,1), $dst",
+                  [(set GR32:$dst, (X86tlsaddr tglobaltlsaddr:$sym))]>;
 
 let AddedComplexity = 10 in
-def TLS_gs_rr : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
+def TLS_gs_rr  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
                   "movl\t%gs:($src), $dst",
                   [(set GR32:$dst, (load (add X86TLStp, GR32:$src)))]>;
 
@@ -2585,18 +2614,48 @@
 let Constraints = "$val = $dst", Defs = [EFLAGS] in {
 def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
                "lock xadd{l}\t{$val, $ptr|$ptr, $val}", 
-               [(set GR32:$dst, (atomic_las_32 addr:$ptr, GR32:$val))]>,
+               [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))]>,
                 TB, LOCK;
 def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
                "lock xadd{w}\t{$val, $ptr|$ptr, $val}", 
-               [(set GR16:$dst, (atomic_las_16 addr:$ptr, GR16:$val))]>,
+               [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))]>,
                 TB, OpSize, LOCK;
 def LXADD8  : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val),
                "lock xadd{b}\t{$val, $ptr|$ptr, $val}", 
-               [(set GR8:$dst, (atomic_las_8 addr:$ptr, GR8:$val))]>,
+               [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))]>,
                 TB, LOCK;
 }
 
+// Atomic exchange, and, or, xor
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomDAGSchedInserter = 1 in {
+def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMAND32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_and addr:$ptr, GR32:$val))]>;
+def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMOR32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_or addr:$ptr, GR32:$val))]>;
+def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMXOR32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_xor addr:$ptr, GR32:$val))]>;
+def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMNAND32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_nand addr:$ptr, GR32:$val))]>;
+
+def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+               "#ATOMMIN32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_min addr:$ptr, GR32:$val))]>;
+def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMMAX32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_max addr:$ptr, GR32:$val))]>;
+def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMUMIN32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_umin addr:$ptr, GR32:$val))]>;
+def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMUMAX32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_umax addr:$ptr, GR32:$val))]>;
+}
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //===----------------------------------------------------------------------===//
@@ -2758,13 +2817,13 @@
 include "X86Instr64bit.td"
 
 //===----------------------------------------------------------------------===//
-// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
+// XMM Floating point support (requires SSE / SSE2)
 //===----------------------------------------------------------------------===//
 
-include "X86InstrMMX.td"
+include "X86InstrSSE.td"
 
 //===----------------------------------------------------------------------===//
-// XMM Floating point support (requires SSE / SSE2)
+// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
 //===----------------------------------------------------------------------===//
 
-include "X86InstrSSE.td"
+include "X86InstrMMX.td"

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86InstrMMX.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86InstrMMX.td?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86InstrMMX.td (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86InstrMMX.td Sun Jul  6 15:45:41 2008
@@ -118,7 +118,8 @@
   }
 
   multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
-                                string OpcodeStr, Intrinsic IntId> {
+                                string OpcodeStr, Intrinsic IntId,
+                                Intrinsic IntId2> {
     def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
                                   (ins VR64:$src1, VR64:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
@@ -131,11 +132,7 @@
     def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
                                    (ins VR64:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-           [(set VR64:$dst, (IntId VR64:$src1,
-                             (v1i64 (bitconvert
-                                     (v2i32 (vector_shuffle immAllZerosV,
-                                     (v2i32 (scalar_to_vector (i32 imm:$src2))),
-                                             MMX_MOVL_shuffle_mask))))))]>;
+           [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>;
   }
 }
 
@@ -167,7 +164,7 @@
                              "movd\t{$src, $dst|$dst, $src}", []>;
 
 let neverHasSideEffects = 1 in
-def MMX_MOVD64from64rr : MMXRI<0x6E, MRMSrcReg, (outs GR64:$dst), (ins VR64:$src),
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMSrcReg, (outs GR64:$dst), (ins VR64:$src),
                                "movd\t{$src, $dst|$dst, $src}", []>;
 
 let neverHasSideEffects = 1 in
@@ -184,13 +181,16 @@
 def MMX_MOVDQ2Qrr : MMXID<0xD6, MRMDestMem, (outs VR64:$dst), (ins VR128:$src),
                           "movdq2q\t{$src, $dst|$dst, $src}",
                           [(set VR64:$dst,
-                            (v1i64 (vector_extract (v2i64 VR128:$src),
-                                  (iPTR 0))))]>;
+                            (v1i64 (bitconvert
+                            (i64 (vector_extract (v2i64 VR128:$src),
+                                  (iPTR 0))))))]>;
 
 def MMX_MOVQ2DQrr : MMXIS<0xD6, MRMDestMem, (outs VR128:$dst), (ins VR64:$src),
                           "movq2dq\t{$src, $dst|$dst, $src}",
-                          [(set VR128:$dst,
-                            (bitconvert (v1i64 VR64:$src)))]>;
+          [(set VR128:$dst,
+                (v2i64 (vector_shuffle immAllZerosV,
+                        (v2i64 (scalar_to_vector (i64 (bitconvert VR64:$src)))),
+                        MOVL_shuffle_mask)))]>;
 
 def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                          "movntq\t{$src, $dst|$dst, $src}",
@@ -200,18 +200,14 @@
 // movd to MMX register zero-extends
 def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
                              "movd\t{$src, $dst|$dst, $src}",
-                             [(set VR64:$dst,
-                               (v2i32 (vector_shuffle immAllZerosV,
-                                       (v2i32 (scalar_to_vector GR32:$src)),
-                                       MMX_MOVL_shuffle_mask)))]>;
+              [(set VR64:$dst,
+                    (v2i32 (X86vzmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
 let AddedComplexity = 20 in
 def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
                              "movd\t{$src, $dst|$dst, $src}",
-                             [(set VR64:$dst,
-                               (v2i32 (vector_shuffle immAllZerosV,
-                                       (v2i32 (scalar_to_vector
-                                               (loadi32 addr:$src))),
-                                       MMX_MOVL_shuffle_mask)))]>;
+          [(set VR64:$dst,
+                (v2i32 (X86vzmovl (v2i32
+                                   (scalar_to_vector (loadi32 addr:$src))))))]>;
 
 // Arithmetic Instructions
 
@@ -280,23 +276,29 @@
 
 // Shift Instructions
 defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
-                                    int_x86_mmx_psrl_w>;
+                                    int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>;
 defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
-                                    int_x86_mmx_psrl_d>;
+                                    int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>;
 defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
-                                    int_x86_mmx_psrl_q>;
+                                    int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>;
 
 defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
-                                    int_x86_mmx_psll_w>;
+                                    int_x86_mmx_psll_w, int_x86_mmx_pslli_w>;
 defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
-                                    int_x86_mmx_psll_d>;
+                                    int_x86_mmx_psll_d, int_x86_mmx_pslli_d>;
 defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
-                                    int_x86_mmx_psll_q>;
+                                    int_x86_mmx_psll_q, int_x86_mmx_pslli_q>;
 
 defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
-                                    int_x86_mmx_psra_w>;
+                                    int_x86_mmx_psra_w, int_x86_mmx_psrai_w>;
 defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
-                                    int_x86_mmx_psra_d>;
+                                    int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
+
+// Shift up / down and insert zero's.
+def : Pat<(v1i64 (X86vshl     VR64:$src, (i8 imm:$amt))),
+          (v1i64 (MMX_PSLLQri VR64:$src, imm:$amt))>;
+def : Pat<(v1i64 (X86vshr     VR64:$src, (i8 imm:$amt))),
+          (v1i64 (MMX_PSRLQri VR64:$src, imm:$amt))>;
 
 // Comparison Instructions
 defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
@@ -526,20 +528,30 @@
           (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
 def : Pat<(store (v2i32 VR64:$src), addr:$dst),
           (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2f32 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
 def : Pat<(store (v1i64 VR64:$src), addr:$dst),
           (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
 
 // Bit convert.
 def : Pat<(v8i8  (bitconvert (v1i64 VR64:$src))), (v8i8  VR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v2i32 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2f32 VR64:$src))), (v8i8  VR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v4i16 VR64:$src))), (v8i8  VR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 VR64:$src))), (v4i16 VR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v8i8  VR64:$src))), (v4i16 VR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 VR64:$src))), (v2i32 VR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v8i8  VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  VR64:$src))), (v2f32 VR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 VR64:$src))), (v1i64 VR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v8i8  VR64:$src))), (v1i64 VR64:$src)>;
 
@@ -548,6 +560,8 @@
           (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2f32 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(v8i8  (bitconvert (i64 GR64:$src))),
@@ -556,6 +570,8 @@
           (MMX_MOVD64from64rr VR64:$src)>;
 def : Pat<(i64 (bitconvert (v2i32 VR64:$src))),
           (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2f32 VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
 def : Pat<(i64 (bitconvert (v4i16 VR64:$src))),
           (MMX_MOVD64from64rr VR64:$src)>;
 def : Pat<(i64  (bitconvert (v8i8 VR64:$src))),
@@ -564,14 +580,10 @@
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
-  def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
-                    (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
-                    MMX_MOVL_shuffle_mask)),
-            (MMX_MOVZDI2PDIrr GR32:$src)>;
-  def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
-                    (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
-                    MMX_MOVL_shuffle_mask)),
-            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v8i8 (X86vzmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
+           (MMX_MOVZDI2PDIrr GR32:$src)>; 
+  def : Pat<(v4i16 (X86vzmovl (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))))),
+           (MMX_MOVZDI2PDIrr GR32:$src)>; 
 }
 
 // Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower
@@ -635,3 +647,19 @@
 def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV_bc))),
                   (load addr:$src2))),
           (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+
+// Move MMX to lower 64-bit of XMM
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert VR64:$src)))),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+// Move lower 64-bit of XMM to MMX.
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86InstrSSE.td?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86InstrSSE.td Sun Jul  6 15:45:41 2008
@@ -47,6 +47,12 @@
 def X86insrtps : SDNode<"X86ISD::INSERTPS", 
                  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
                                       SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
+def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
+                 SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+                        [SDNPHasChain, SDNPMayLoad]>;
+def X86vshl    : SDNode<"X86ISD::VSHL",      SDTIntShiftOp>;
+def X86vshr    : SDNode<"X86ISD::VSRL",      SDTIntShiftOp>;
 
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
@@ -157,6 +163,22 @@
   return getI32Imm(N->getValue() >> 3);
 }]>;
 
+def SSE_CC_imm  : SDNodeXForm<cond, [{
+  unsigned Val;
+  switch (N->get()) {
+  default: Val = 0; assert(0 && "Unexpected CondCode"); break;
+  case ISD::SETOEQ: Val = 0; break;
+  case ISD::SETOLT: Val = 1; break;
+  case ISD::SETOLE: Val = 2; break;
+  case ISD::SETUO:  Val = 3; break;
+  case ISD::SETONE: Val = 4; break;
+  case ISD::SETOGE: Val = 5; break;
+  case ISD::SETOGT: Val = 6; break;
+  case ISD::SETO:   Val = 7; break;
+  }
+  return getI8Imm(Val);
+}]>;
+
 // SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
 // SHUFP* etc. imm.
 def SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{
@@ -251,6 +273,7 @@
   return X86::isSHUFPMask(N);
 }], SHUFFLE_get_shuf_imm>;
 
+
 //===----------------------------------------------------------------------===//
 // SSE scalar FP Instructions
 //===----------------------------------------------------------------------===//
@@ -521,31 +544,36 @@
   }
 
   // Scalar operation, reg+mem.
-  def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2),
+  def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+                                 (ins FR32:$src1, f32mem:$src2),
                  !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                  [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
                  
   // Vector operation, reg+reg.
-  def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+  def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                 (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
     let isCommutable = Commutable;
   }
 
   // Vector operation, reg+mem.
-  def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+  def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                 (ins VR128:$src1, f128mem:$src2),
                  !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
-                 [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+             [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
 
   // Intrinsic operation, reg+reg.
-  def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+  def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
     let isCommutable = Commutable;
   }
 
   // Intrinsic operation, reg+mem.
-  def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+  def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, ssmem:$src2),
                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F32Int VR128:$src1,
                                                sse_load_f32:$src2))]>;
@@ -582,46 +610,53 @@
   }
 
   // Scalar operation, reg+mem.
-  def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2),
+  def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+                                 (ins FR32:$src1, f32mem:$src2),
                  !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                  [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
                  
   // Vector operation, reg+reg.
-  def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+  def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                 (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
     let isCommutable = Commutable;
   }
 
   // Vector operation, reg+mem.
-  def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+  def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                 (ins VR128:$src1, f128mem:$src2),
                  !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
-                 [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+             [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
 
   // Intrinsic operation, reg+reg.
-  def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+  def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
     let isCommutable = Commutable;
   }
 
   // Intrinsic operation, reg+mem.
-  def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+  def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, ssmem:$src2),
                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F32Int VR128:$src1,
                                                sse_load_f32:$src2))]>;
 
   // Vector intrinsic operation, reg+reg.
-  def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+  def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
     let isCommutable = Commutable;
   }
 
   // Vector intrinsic operation, reg+mem.
-  def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+  def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, f128mem:$src2),
                      !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
-                     [(set VR128:$dst, (V4F32Int VR128:$src1, (load addr:$src2)))]>;
+           [(set VR128:$dst, (V4F32Int VR128:$src1, (memopv4f32 addr:$src2)))]>;
 }
 }
 
@@ -671,20 +706,21 @@
     def MOVLPSrm : PSI<0x12, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                        "movlps\t{$src2, $dst|$dst, $src2}",
-                       [(set VR128:$dst, 
-                         (v4f32 (vector_shuffle VR128:$src1,
-                         (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
-                                 MOVLP_shuffle_mask)))]>;
+       [(set VR128:$dst, 
+             (v4f32 (vector_shuffle VR128:$src1,
+                     (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+                     MOVLP_shuffle_mask)))]>;
     def MOVHPSrm : PSI<0x16, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                        "movhps\t{$src2, $dst|$dst, $src2}",
-                       [(set VR128:$dst, 
-                         (v4f32 (vector_shuffle VR128:$src1,
-                         (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
-                                 MOVHP_shuffle_mask)))]>;
+       [(set VR128:$dst, 
+             (v4f32 (vector_shuffle VR128:$src1,
+                     (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+                     MOVHP_shuffle_mask)))]>;
   } // AddedComplexity
 } // Constraints = "$src1 = $dst"
 
+
 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movlps\t{$src, $dst|$dst, $src}",
                    [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
@@ -783,7 +819,7 @@
   // Vector intrinsic operation, mem
   def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (V4F32Int (load addr:$src)))]>;
+                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
 }
 
 // Square root.
@@ -850,16 +886,20 @@
 
 let Constraints = "$src1 = $dst" in {
   def CMPPSrri : PSIi8<0xC2, MRMSrcReg, 
-                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
-                      "cmp${cc}ps\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
-                                         VR128:$src, imm:$cc))]>;
+                    (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+                    "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+                                                        VR128:$src, imm:$cc))]>;
   def CMPPSrmi : PSIi8<0xC2, MRMSrcMem, 
-                      (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
-                      "cmp${cc}ps\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
-                                         (load addr:$src), imm:$cc))]>;
-}
+                  (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+                  "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+                  [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+                                            (memop addr:$src), imm:$cc))]>;
+}
+def : Pat<(v4i32 (vsetcc (v4f32 VR128:$src1), VR128:$src2, cond:$cc)),
+          (CMPPSrri VR128:$src1, VR128:$src2, (SSE_CC_imm cond:$cc))>;
+def : Pat<(v4i32 (vsetcc (v4f32 VR128:$src1), (memop addr:$src2), cond:$cc)),
+          (CMPPSrmi VR128:$src1, addr:$src2, (SSE_CC_imm cond:$cc))>;
 
 // Shuffle and unpack instructions
 let Constraints = "$src1 = $dst" in {
@@ -1007,10 +1047,11 @@
 let AddedComplexity = 20 in
 def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
                       "movss\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
-                                 (v4f32 (scalar_to_vector (loadf32 addr:$src))),
-                                                MOVL_shuffle_mask)))]>;
+                   [(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector
+                                                      (loadf32 addr:$src))))))]>;
 
+def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+          (MOVZSS2PSrm addr:$src)>;
 
 //===----------------------------------------------------------------------===//
 // SSE2 Instructions
@@ -1074,14 +1115,14 @@
 def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
                          "cvtpd2pi\t{$src, $dst|$dst, $src}",
                          [(set VR64:$dst, (int_x86_sse_cvtpd2pi 
-                                           (load addr:$src)))]>;
+                                           (memop addr:$src)))]>;
 def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
                          "cvttpd2pi\t{$src, $dst|$dst, $src}",
                          [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
 def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
                          "cvttpd2pi\t{$src, $dst|$dst, $src}",
                          [(set VR64:$dst, (int_x86_sse_cvttpd2pi 
-                                           (load addr:$src)))]>;
+                                           (memop addr:$src)))]>;
 def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
                          "cvtpi2pd\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
@@ -1180,26 +1221,32 @@
 // Alias bitwise logical operations using SSE logical ops on packed FP values.
 let Constraints = "$src1 = $dst" in {
 let isCommutable = 1 in {
-  def FsANDPDrr : PDI<0x54, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+  def FsANDPDrr : PDI<0x54, MRMSrcReg, (outs FR64:$dst),
+                                       (ins FR64:$src1, FR64:$src2),
                       "andpd\t{$src2, $dst|$dst, $src2}",
                       [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
-  def FsORPDrr  : PDI<0x56, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+  def FsORPDrr  : PDI<0x56, MRMSrcReg, (outs FR64:$dst),
+                                       (ins FR64:$src1, FR64:$src2),
                       "orpd\t{$src2, $dst|$dst, $src2}",
                       [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
-  def FsXORPDrr : PDI<0x57, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+  def FsXORPDrr : PDI<0x57, MRMSrcReg, (outs FR64:$dst),
+                                       (ins FR64:$src1, FR64:$src2),
                       "xorpd\t{$src2, $dst|$dst, $src2}",
                       [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
 }
 
-def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
+def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst),
+                                     (ins FR64:$src1, f128mem:$src2),
                     "andpd\t{$src2, $dst|$dst, $src2}",
                     [(set FR64:$dst, (X86fand FR64:$src1,
                                       (memopfsf64 addr:$src2)))]>;
-def FsORPDrm  : PDI<0x56, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
+def FsORPDrm  : PDI<0x56, MRMSrcMem, (outs FR64:$dst),
+                                     (ins FR64:$src1, f128mem:$src2),
                     "orpd\t{$src2, $dst|$dst, $src2}",
                     [(set FR64:$dst, (X86for FR64:$src1,
                                       (memopfsf64 addr:$src2)))]>;
-def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
+def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst),
+                                     (ins FR64:$src1, f128mem:$src2),
                     "xorpd\t{$src2, $dst|$dst, $src2}",
                     [(set FR64:$dst, (X86fxor FR64:$src1,
                                       (memopfsf64 addr:$src2)))]>;
@@ -1298,46 +1345,54 @@
   }
 
   // Scalar operation, reg+mem.
-  def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2),
+  def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+                                 (ins FR64:$src1, f64mem:$src2),
                  !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                  [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
                  
   // Vector operation, reg+reg.
-  def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+  def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+                                 (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
     let isCommutable = Commutable;
   }
 
   // Vector operation, reg+mem.
-  def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+  def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+                                 (ins VR128:$src1, f128mem:$src2),
                  !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
-                 [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+             [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
 
   // Intrinsic operation, reg+reg.
-  def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+  def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
     let isCommutable = Commutable;
   }
 
   // Intrinsic operation, reg+mem.
-  def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+  def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, sdmem:$src2),
                      !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F64Int VR128:$src1,
                                                sse_load_f64:$src2))]>;
 
   // Vector intrinsic operation, reg+reg.
-  def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+  def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
     let isCommutable = Commutable;
   }
 
   // Vector intrinsic operation, reg+mem.
-  def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+  def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, f128mem:$src2),
                      !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
-                     [(set VR128:$dst, (V2F64Int VR128:$src1, (load addr:$src2)))]>;
+                     [(set VR128:$dst, (V2F64Int VR128:$src1,
+                                                 (memopv2f64 addr:$src2)))]>;
 }
 }
 
@@ -1442,7 +1497,7 @@
 def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvtps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvtps2dq
-                                            (load addr:$src)))]>;
+                                            (memop addr:$src)))]>;
 // SSE2 packed instructions with XS prefix
 def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
@@ -1451,7 +1506,7 @@
 def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
-                                           (load addr:$src)))]>,
+                                           (memop addr:$src)))]>,
                       XS, Requires<[HasSSE2]>;
 
 // SSE2 packed instructions with XD prefix
@@ -1462,7 +1517,7 @@
 def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
-                                          (load addr:$src)))]>,
+                                          (memop addr:$src)))]>,
                      XD, Requires<[HasSSE2]>;
 
 def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -1471,14 +1526,14 @@
 def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                           "cvttpd2dq\t{$src, $dst|$dst, $src}",
                           [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
-                                             (load addr:$src)))]>;
+                                             (memop addr:$src)))]>;
 
 // SSE2 instructions without OpSize prefix
 def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
                      TB, Requires<[HasSSE2]>;
-def Int_CVTPS2PDrm : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins f64mem:$src),
+def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                        "cvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd
                                           (load addr:$src)))]>,
@@ -1487,10 +1542,10 @@
 def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
-def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins f128mem:$src),
+def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvtpd2ps\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
-                                            (load addr:$src)))]>;
+                                            (memop addr:$src)))]>;
 
 // Match intrinsics which expect XMM operand(s).
 // Aliases for intrinsics
@@ -1594,7 +1649,7 @@
   // Vector intrinsic operation, mem
   def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (V2F64Int (load addr:$src)))]>;
+                    [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
 }
 
 // Square root.
@@ -1663,13 +1718,17 @@
                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
                     "cmp${cc}pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
-                                       VR128:$src, imm:$cc))]>;
+                                                        VR128:$src, imm:$cc))]>;
   def CMPPDrmi : PDIi8<0xC2, MRMSrcMem, 
                   (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
                   "cmp${cc}pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
-                                     (load addr:$src), imm:$cc))]>;
+                                                 (memop addr:$src), imm:$cc))]>;
 }
+def : Pat<(v2i64 (vsetcc (v2f64 VR128:$src1), VR128:$src2, cond:$cc)),
+          (CMPPDrri VR128:$src1, VR128:$src2, (SSE_CC_imm cond:$cc))>;
+def : Pat<(v2i64 (vsetcc (v2f64 VR128:$src1), (memop addr:$src2), cond:$cc)),
+          (CMPPDrmi VR128:$src1, addr:$src2, (SSE_CC_imm cond:$cc))>;
 
 // Shuffle and unpack instructions
 let Constraints = "$src1 = $dst" in {
@@ -1774,6 +1833,21 @@
                                         (bitconvert (memopv2i64 addr:$src2))))]>;
 }
 
+multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                             string OpcodeStr,
+                             Intrinsic IntId, Intrinsic IntId2> {
+  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                        (bitconvert (memopv2i64 addr:$src2))))]>;
+  def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
+}
+
 /// PDI_binop_rm - Simple SSE2 binary operator.
 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, bit Commutable = 0> {
@@ -1848,64 +1922,24 @@
 defm PSADBW : PDI_binop_rm_int<0xE0, "psadbw", int_x86_sse2_psad_bw, 1>;
 
 
-defm PSLLW : PDI_binop_rm_int<0xF1, "psllw", int_x86_sse2_psll_w>;
-defm PSLLD : PDI_binop_rm_int<0xF2, "pslld", int_x86_sse2_psll_d>;
-defm PSLLQ : PDI_binop_rm_int<0xF3, "psllq", int_x86_sse2_psll_q>;
-
-defm PSRLW : PDI_binop_rm_int<0xD1, "psrlw", int_x86_sse2_psrl_w>;
-defm PSRLD : PDI_binop_rm_int<0xD2, "psrld", int_x86_sse2_psrl_d>;
-defm PSRLQ : PDI_binop_rm_int<0xD3, "psrlq", int_x86_sse2_psrl_q>;
-
-defm PSRAW : PDI_binop_rm_int<0xE1, "psraw", int_x86_sse2_psra_w>;
-defm PSRAD : PDI_binop_rm_int<0xE2, "psrad", int_x86_sse2_psra_d>;
-
-// Some immediate variants need to match a bit_convert.
-let Constraints = "$src1 = $dst" in {
-def PSLLWri : PDIi8<0x71, MRM6r, (outs VR128:$dst),
-                                 (ins VR128:$src1, i32i8imm:$src2),
-                    "psllw\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (int_x86_sse2_psll_w VR128:$src1,
-                      (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
-def PSLLDri : PDIi8<0x72, MRM6r, (outs VR128:$dst),
-                                 (ins VR128:$src1, i32i8imm:$src2),
-                    "pslld\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (int_x86_sse2_psll_d VR128:$src1,
-                          (scalar_to_vector (i32 imm:$src2))))]>;
-def PSLLQri : PDIi8<0x73, MRM6r, (outs VR128:$dst),
-                                 (ins VR128:$src1, i32i8imm:$src2),
-                    "psllq\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (int_x86_sse2_psll_q VR128:$src1,
-                      (bc_v2i64 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
-
-def PSRLWri : PDIi8<0x71, MRM2r, (outs VR128:$dst),
-                                 (ins VR128:$src1, i32i8imm:$src2),
-                    "psrlw\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (int_x86_sse2_psrl_w VR128:$src1,
-                      (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
-def PSRLDri : PDIi8<0x72, MRM2r, (outs VR128:$dst),
-                                 (ins VR128:$src1, i32i8imm:$src2),
-                    "psrld\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (int_x86_sse2_psrl_d VR128:$src1,
-                          (scalar_to_vector (i32 imm:$src2))))]>;
-def PSRLQri : PDIi8<0x73, MRM2r, (outs VR128:$dst),
-                                 (ins VR128:$src1, i32i8imm:$src2),
-                    "psrlq\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (int_x86_sse2_psrl_q VR128:$src1,
-                      (bc_v2i64 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
-
-def PSRAWri : PDIi8<0x71, MRM4r, (outs VR128:$dst),
-                                 (ins VR128:$src1, i32i8imm:$src2),
-                    "psraw\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (int_x86_sse2_psra_w VR128:$src1,
-                      (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
-def PSRADri : PDIi8<0x72, MRM4r, (outs VR128:$dst),
-                                 (ins VR128:$src1, i32i8imm:$src2),
-                    "psrad\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (int_x86_sse2_psra_d VR128:$src1,
-                          (scalar_to_vector (i32 imm:$src2))))]>;
-}
-
-// PSRAQ doesn't exist in SSE[1-3].
+defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                               int_x86_sse2_psll_w, int_x86_sse2_pslli_w>;
+defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                               int_x86_sse2_psll_d, int_x86_sse2_pslli_d>;
+defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                               int_x86_sse2_psll_q, int_x86_sse2_pslli_q>;
+
+defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                               int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>;
+defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                               int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>;
+defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                               int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>;
+
+defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                               int_x86_sse2_psra_w, int_x86_sse2_psrai_w>;
+defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                               int_x86_sse2_psra_d, int_x86_sse2_psrai_d>;
 
 // 128-bit logical shifts.
 let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
@@ -1925,6 +1959,12 @@
             (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
   def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
             (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+
+  // Shift up / down and insert zero's.
+  def : Pat<(v2i64 (X86vshl  VR128:$src, (i8 imm:$amt))),
+            (v2i64 (PSLLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
+  def : Pat<(v2i64 (X86vshr  VR128:$src, (i8 imm:$amt))),
+            (v2i64 (PSRLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
 }
 
 // Logical
@@ -1954,6 +1994,33 @@
 defm PCMPGTW  : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
 defm PCMPGTD  : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
 
+def : Pat<(v16i8 (vsetcc (v16i8 VR128:$src1), VR128:$src2, SETEQ)),
+          (PCMPEQBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (vsetcc (v16i8 VR128:$src1), (memop addr:$src2), SETEQ)),
+          (PCMPEQBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (vsetcc (v8i16 VR128:$src1), VR128:$src2, SETEQ)),
+          (PCMPEQWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (vsetcc (v8i16 VR128:$src1), (memop addr:$src2), SETEQ)),
+          (PCMPEQWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (vsetcc (v4i32 VR128:$src1), VR128:$src2, SETEQ)),
+          (PCMPEQDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (vsetcc (v4i32 VR128:$src1), (memop addr:$src2), SETEQ)),
+          (PCMPEQDrm VR128:$src1, addr:$src2)>;
+
+def : Pat<(v16i8 (vsetcc (v16i8 VR128:$src1), VR128:$src2, SETGT)),
+          (PCMPGTBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (vsetcc (v16i8 VR128:$src1), (memop addr:$src2), SETGT)),
+          (PCMPGTBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (vsetcc (v8i16 VR128:$src1), VR128:$src2, SETGT)),
+          (PCMPGTWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (vsetcc (v8i16 VR128:$src1), (memop addr:$src2), SETGT)),
+          (PCMPGTWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (vsetcc (v4i32 VR128:$src1), VR128:$src2, SETGT)),
+          (PCMPGTDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (vsetcc (v4i32 VR128:$src1), (memop addr:$src2), SETGT)),
+          (PCMPGTDrm VR128:$src1, addr:$src2)>;
+
+
 // Pack instructions
 defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
 defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
@@ -2279,56 +2346,59 @@
 
 // Move to lower bits of a VR128 and zeroing upper bits.
 // Loading from memory automatically zeroing upper bits.
-let AddedComplexity = 20 in
-  def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
-                        "movsd\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst,
-                          (v2f64 (vector_shuffle immAllZerosV_bc,
-                                  (v2f64 (scalar_to_vector
-                                          (loadf64 addr:$src))),
-                                  MOVL_shuffle_mask)))]>;
+let AddedComplexity = 20 in {
+def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                      "movsd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2f64 (X86vzmovl (v2f64 (scalar_to_vector
+                                                 (loadf64 addr:$src))))))]>;
+
+def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzload addr:$src)), (MOVZSD2PDrm addr:$src)>;
+}
 
 // movd / movq to XMM register zero-extends
 let AddedComplexity = 15 in {
 def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst,
-                         (v4i32 (vector_shuffle immAllZerosV,
-                                 (v4i32 (scalar_to_vector GR32:$src)),
-                                 MOVL_shuffle_mask)))]>;
+                       [(set VR128:$dst, (v4i32 (X86vzmovl
+                                      (v4i32 (scalar_to_vector GR32:$src)))))]>;
 // This is X86-64 only.
 def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst,
-                         (v2i64 (vector_shuffle immAllZerosV_bc,
-                                 (v2i64 (scalar_to_vector GR64:$src)),
-                                 MOVL_shuffle_mask)))]>;
+                       [(set VR128:$dst, (v2i64 (X86vzmovl
+                                      (v2i64 (scalar_to_vector GR64:$src)))))]>;
 }
 
-// Handle the v2f64 form of 'MOVZQI2PQIrr' for PR2108.  FIXME: this would be
-// better written as a dag combine xform.
-let AddedComplexity = 15 in
-def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
-                                  (v2f64 (scalar_to_vector 
-                                       (f64 (bitconvert GR64:$src)))),
-                                  MOVL_shuffle_mask)),
-          (MOVZQI2PQIrr GR64:$src)>, Requires<[HasSSE2]>;
-          
-
 let AddedComplexity = 20 in {
 def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v4i32 (vector_shuffle immAllZerosV,
-                                 (v4i32 (scalar_to_vector (loadi32 addr:$src))),
-                                 MOVL_shuffle_mask)))]>;
+                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+                                                   (loadi32 addr:$src))))))]>;
+
+def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+            (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+            (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+            (MOVZDI2PDIrm addr:$src)>;
+
 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
-                       (v2i64 (vector_shuffle immAllZerosV_bc,
-                              (v2i64 (scalar_to_vector (loadi64 addr:$src))),
-                              MOVL_shuffle_mask)))]>, XS,
+                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+                                                 (loadi64 addr:$src))))))]>, XS,
                    Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
 }
 
 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
@@ -2336,19 +2406,20 @@
 let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                    [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
-                                             VR128:$src,
-                                             MOVL_shuffle_mask)))]>,
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                       XS, Requires<[HasSSE2]>;
 
-let AddedComplexity = 20 in
+let AddedComplexity = 20 in {
 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                    [(set VR128:$dst, (v2i64 (vector_shuffle immAllZerosV_bc,
-                                             (memopv2i64 addr:$src),
-                                             MOVL_shuffle_mask)))]>,
+                    [(set VR128:$dst, (v2i64 (X86vzmovl
+                                             (loadv2i64 addr:$src))))]>,
                       XS, Requires<[HasSSE2]>;
 
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+            (MOVZPQILo2PQIrm addr:$src)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE3 Instructions
 //===----------------------------------------------------------------------===//
@@ -2400,7 +2471,7 @@
                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                         "addsubps\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
-                                           (load addr:$src2)))]>;
+                                           (memop addr:$src2)))]>;
   def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "addsubpd\t{$src2, $dst|$dst, $src2}",
@@ -2410,7 +2481,7 @@
                        (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                        "addsubpd\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
-                                          (load addr:$src2)))]>;
+                                          (memop addr:$src2)))]>;
 }
 
 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -2425,7 +2496,7 @@
 class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
   : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (load addr:$src2))))]>;
+         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (memop addr:$src2))))]>;
 class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
   : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
@@ -2433,7 +2504,7 @@
 class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
   : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-        [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (load addr:$src2))))]>;
+      [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (memopv2f64 addr:$src2))))]>;
 
 let Constraints = "$src1 = $dst" in {
   def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
@@ -2674,13 +2745,13 @@
 
 defm PHADDW      : SS3I_binop_rm_int_16<0x01, "phaddw",
                                         int_x86_ssse3_phadd_w,
-                                        int_x86_ssse3_phadd_w_128, 1>;
+                                        int_x86_ssse3_phadd_w_128>;
 defm PHADDD      : SS3I_binop_rm_int_32<0x02, "phaddd",
                                         int_x86_ssse3_phadd_d,
-                                        int_x86_ssse3_phadd_d_128, 1>;
+                                        int_x86_ssse3_phadd_d_128>;
 defm PHADDSW     : SS3I_binop_rm_int_16<0x03, "phaddsw",
                                         int_x86_ssse3_phadd_sw,
-                                        int_x86_ssse3_phadd_sw_128, 1>;
+                                        int_x86_ssse3_phadd_sw_128>;
 defm PHSUBW      : SS3I_binop_rm_int_16<0x05, "phsubw",
                                         int_x86_ssse3_phsub_w,
                                         int_x86_ssse3_phsub_w_128>;
@@ -2692,7 +2763,7 @@
                                         int_x86_ssse3_phsub_sw_128>;
 defm PMADDUBSW   : SS3I_binop_rm_int_8 <0x04, "pmaddubsw",
                                         int_x86_ssse3_pmadd_ub_sw,
-                                        int_x86_ssse3_pmadd_ub_sw_128, 1>;
+                                        int_x86_ssse3_pmadd_ub_sw_128>;
 defm PMULHRSW    : SS3I_binop_rm_int_16<0x0B, "pmulhrsw",
                                         int_x86_ssse3_pmul_hr_sw,
                                         int_x86_ssse3_pmul_hr_sw_128, 1>;
@@ -2717,7 +2788,7 @@
                              (int_x86_ssse3_palign_r
                               VR64:$src1, VR64:$src2,
                               imm:$src3))]>;
-  def PALIGNR64rm  : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+  def PALIGNR64rm  : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
                            (ins VR64:$src1, i64mem:$src2, i16imm:$src3),
                            "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                            [(set VR64:$dst,
@@ -2733,7 +2804,7 @@
                              (int_x86_ssse3_palign_r_128
                               VR128:$src1, VR128:$src2,
                               imm:$src3))]>, OpSize;
-  def PALIGNR128rm : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+  def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
                            (ins VR128:$src1, i128mem:$src2, i32imm:$src3),
                            "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                            [(set VR128:$dst,
@@ -2793,12 +2864,12 @@
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
 // Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
-                  (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
+def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
           (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
-                  (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
+def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
           (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+          (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE2]>;
 }
 
 // Splat v2f64 / v2i64
@@ -2903,33 +2974,66 @@
 let AddedComplexity = 20 in {
 // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
 // vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS
-def : Pat<(v4f32 (vector_shuffle VR128:$src1, (memopv4f32 addr:$src2),
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (memop addr:$src2),
                   MOVLP_shuffle_mask)),
           (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
-def : Pat<(v2f64 (vector_shuffle VR128:$src1, (memopv2f64 addr:$src2),
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (memop addr:$src2),
                   MOVLP_shuffle_mask)),
           (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (vector_shuffle VR128:$src1, (memopv4f32 addr:$src2),
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (memop addr:$src2),
                   MOVHP_shuffle_mask)),
           (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
-def : Pat<(v2f64 (vector_shuffle VR128:$src1, (memopv2f64 addr:$src2),
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (memop addr:$src2),
                   MOVHP_shuffle_mask)),
           (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 
-def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)),
+def : Pat<(v4i32 (vector_shuffle VR128:$src1,
+                                 (bc_v4i32 (memopv2i64 addr:$src2)),
                   MOVLP_shuffle_mask)),
           (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 (vector_shuffle VR128:$src1, (memopv2i64 addr:$src2),
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (memop addr:$src2),
                   MOVLP_shuffle_mask)),
           (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)),
+def : Pat<(v4i32 (vector_shuffle VR128:$src1,
+                                 (bc_v4i32 (memopv2i64 addr:$src2)),
                   MOVHP_shuffle_mask)),
           (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
-def : Pat<(v2i64 (vector_shuffle VR128:$src1, (memopv2i64 addr:$src2),
-                  MOVLP_shuffle_mask)),
-          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (memop addr:$src2),
+                  MOVHP_shuffle_mask)),
+          (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 }
 
+// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+// (store (vector_shuffle (load addr), v2, <0, 1, 4, 5>), addr) using MOVHPS
+def : Pat<(store (v4f32 (vector_shuffle (memop addr:$src1), VR128:$src2,
+                         MOVLP_shuffle_mask)), addr:$src1),
+          (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2f64 (vector_shuffle (memop addr:$src1), VR128:$src2,
+                         MOVLP_shuffle_mask)), addr:$src1),
+          (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4f32 (vector_shuffle (memop addr:$src1), VR128:$src2,
+                         MOVHP_shuffle_mask)), addr:$src1),
+          (MOVHPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2f64 (vector_shuffle (memop addr:$src1), VR128:$src2,
+                         MOVHP_shuffle_mask)), addr:$src1),
+          (MOVHPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(store (v4i32 (vector_shuffle
+                         (bc_v4i32 (memopv2i64 addr:$src1)), VR128:$src2,
+                         MOVLP_shuffle_mask)), addr:$src1),
+          (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2i64 (vector_shuffle (memop addr:$src1), VR128:$src2,
+                         MOVLP_shuffle_mask)), addr:$src1),
+          (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 (vector_shuffle
+                         (bc_v4i32 (memopv2i64 addr:$src1)), VR128:$src2,
+                         MOVHP_shuffle_mask)), addr:$src1),
+          (MOVHPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2i64 (vector_shuffle (memop addr:$src1), VR128:$src2,
+                         MOVHP_shuffle_mask)), addr:$src1),
+          (MOVHPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+
 let AddedComplexity = 15 in {
 // Setting the lowest element in the vector.
 def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
@@ -2953,37 +3057,8 @@
 def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc, VR128:$src,
            MOVL_shuffle_mask)),
           (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
-
-
-// FIXME: Temporary workaround since 2-wide shuffle is broken.
-def : Pat<(int_x86_sse2_movs_d  VR128:$src1, VR128:$src2),
-          (v2f64 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_loadh_pd VR128:$src1, addr:$src2),
-          (v2f64 (MOVHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_loadl_pd VR128:$src1, addr:$src2),
-          (v2f64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, VR128:$src2, imm:$src3),
-          (v2f64 (SHUFPDrri VR128:$src1, VR128:$src2, imm:$src3))>,
-      Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, (load addr:$src2), imm:$src3),
-          (v2f64 (SHUFPDrmi VR128:$src1, addr:$src2, imm:$src3))>,
-      Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, VR128:$src2),
-          (v2f64 (UNPCKHPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, (load addr:$src2)),
-          (v2f64 (UNPCKHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, VR128:$src2),
-          (v2f64 (UNPCKLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, (load addr:$src2)),
-          (v2f64 (UNPCKLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, VR128:$src2),
-          (v2i64 (PUNPCKHQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, (load addr:$src2)),
-          (v2i64 (PUNPCKHQDQrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, VR128:$src2),
-          (v2i64 (PUNPCKLQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
-def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, (load addr:$src2)),
-          (PUNPCKLQDQrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+          (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
 
 // Some special case pandn patterns.
 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
@@ -2997,13 +3072,13 @@
           (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
 
 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
-                  (memopv2i64 addr:$src2))),
+                  (memop addr:$src2))),
           (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
-                  (memopv2i64 addr:$src2))),
+                  (memop addr:$src2))),
           (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
-                  (memopv2i64 addr:$src2))),
+                  (memop addr:$src2))),
           (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 
 // vector -> vector casts
@@ -3079,7 +3154,8 @@
                     (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set VR128:$dst, (V4F32Int (load addr:$src1),imm:$src2))]>,
+                    [(set VR128:$dst,
+                          (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
                     OpSize;
 
   // Intrinsic operation, reg.
@@ -3111,7 +3187,8 @@
                     (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set VR128:$dst, (V2F64Int (load addr:$src1),imm:$src2))]>,
+                    [(set VR128:$dst,
+                          (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>,
                     OpSize;
 }
 
@@ -3178,19 +3255,18 @@
                                        int_x86_sse41_pmaxud, 1>;
 defm PMAXUW       : SS41I_binop_rm_int<0x3E, "pmaxuw",
                                        int_x86_sse41_pmaxuw, 1>;
-defm PMULDQ       : SS41I_binop_rm_int<0x28, "pmuldq",
-                                       int_x86_sse41_pmuldq, 1>;
 
 
 /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
 let Constraints = "$src1 = $dst" in {
-  multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                Intrinsic IntId128, bit Commutable = 0> {
+  multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT,
+                                SDNode OpNode, Intrinsic IntId128,
+                                bit Commutable = 0> {
     def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                   [(set VR128:$dst, (OpNode (v4i32 VR128:$src1),
-                                                    VR128:$src2))]>, OpSize {
+                   [(set VR128:$dst, (OpNode (OpVT VR128:$src1),
+                                                   VR128:$src2))]>, OpSize {
       let isCommutable = Commutable;
     }
     def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
@@ -3204,17 +3280,19 @@
                    (ins VR128:$src1, i128mem:$src2),
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
-                     (OpNode VR128:$src1, (memopv4i32 addr:$src2)))]>, OpSize;
+                     (OpNode VR128:$src1, (memop addr:$src2)))]>, OpSize;
     def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                        (ins VR128:$src1, i128mem:$src2),
                        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                        [(set VR128:$dst,
-                        (IntId128 VR128:$src1, (memopv4i32 addr:$src2)))]>,
+                        (IntId128 VR128:$src1, (memop addr:$src2)))]>,
                        OpSize;
   }
 }
-defm PMULLD       : SS41I_binop_patint<0x40, "pmulld", mul,
+defm PMULLD       : SS41I_binop_patint<0x40, "pmulld", v4i32, mul,
                                        int_x86_sse41_pmulld, 1>;
+defm PMULDQ       : SS41I_binop_patint<0x28, "pmuldq", v2i64, mul,
+                                       int_x86_sse41_pmuldq, 1>;
 
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
@@ -3252,7 +3330,7 @@
 defm DPPD         : SS41I_binop_rmi_int<0x41, "dppd",
                                         int_x86_sse41_dppd, 1>;
 defm MPSADBW      : SS41I_binop_rmi_int<0x42, "mpsadbw",
-                                        int_x86_sse41_mpsadbw, 0>;
+                                        int_x86_sse41_mpsadbw, 1>;
 
 
 /// SS41I_ternary_int - SSE 4.1 ternary operator

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86IntelAsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86IntelAsmPrinter.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86IntelAsmPrinter.cpp (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86IntelAsmPrinter.cpp Sun Jul  6 15:45:41 2008
@@ -15,20 +15,109 @@
 
 #define DEBUG_TYPE "asm-printer"
 #include "X86IntelAsmPrinter.h"
+#include "X86InstrInfo.h"
 #include "X86TargetAsmInfo.h"
 #include "X86.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
 #include "llvm/Module.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/Support/Mangler.h"
 #include "llvm/Target/TargetAsmInfo.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+                                                    const TargetData *TD) {
+  X86MachineFunctionInfo Info;
+  uint64_t Size = 0;
+
+  switch (F->getCallingConv()) {
+  case CallingConv::X86_StdCall:
+    Info.setDecorationStyle(StdCall);
+    break;
+  case CallingConv::X86_FastCall:
+    Info.setDecorationStyle(FastCall);
+    break;
+  default:
+    return Info;
+  }
+
+  unsigned argNum = 1;
+  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+       AI != AE; ++AI, ++argNum) {
+    const Type* Ty = AI->getType();
+
+    // 'Dereference' type in case of byval parameter attribute
+    if (F->paramHasAttr(argNum, ParamAttr::ByVal))
+      Ty = cast<PointerType>(Ty)->getElementType();
+
+    // Size should be aligned to DWORD boundary
+    Size += ((TD->getABITypeSize(Ty) + 3)/4)*4;
+  }
+
+  // We're not supporting tooooo huge arguments :)
+  Info.setBytesToPopOnReturn((unsigned int)Size);
+  return Info;
+}
+
+
+/// decorateName - Query FunctionInfoMap and use this information for various
+/// name decoration.
+void X86IntelAsmPrinter::decorateName(std::string &Name,
+                                      const GlobalValue *GV) {
+  const Function *F = dyn_cast<Function>(GV);
+  if (!F) return;
+
+  // We don't want to decorate non-stdcall or non-fastcall functions right now
+  unsigned CC = F->getCallingConv();
+  if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+    return;
+
+  FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+
+  const X86MachineFunctionInfo *Info;
+  if (info_item == FunctionInfoMap.end()) {
+    // Calculate apropriate function info and populate map
+    FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData());
+    Info = &FunctionInfoMap[F];
+  } else {
+    Info = &info_item->second;
+  }
+
+  const FunctionType *FT = F->getFunctionType();
+  switch (Info->getDecorationStyle()) {
+  case None:
+    break;
+  case StdCall:
+    // "Pure" variadic functions do not receive @0 suffix.
+    if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+        (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+      Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+    break;
+  case FastCall:
+    // "Pure" variadic functions do not receive @0 suffix.
+    if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+        (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+      Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+
+    if (Name[0] == '_')
+      Name[0] = '@';
+    else
+      Name = '@' + Name;
+
+    break;
+  default:
+    assert(0 && "Unsupported DecorationStyle");
+  }
+}
+
+
 std::string X86IntelAsmPrinter::getSectionForFunction(const Function &F) const {
   // Intel asm always emits functions to _text.
   return "_text";
@@ -53,7 +142,7 @@
   if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
     FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
 
-  X86SharedAsmPrinter::decorateName(CurrentFnName, F);
+  decorateName(CurrentFnName, F);
 
   SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
 
@@ -62,18 +151,18 @@
   default: assert(0 && "Unsupported linkage type!");
   case Function::InternalLinkage:
     EmitAlignment(FnAlign);
-    break;    
+    break;
   case Function::DLLExportLinkage:
     DLLExportedFns.insert(CurrentFnName);
     //FALLS THROUGH
   case Function::ExternalLinkage:
     O << "\tpublic " << CurrentFnName << "\n";
     EmitAlignment(FnAlign);
-    break;    
+    break;
   }
-  
+
   O << CurrentFnName << "\tproc near\n";
-  
+
   // Print out code for the function.
   for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
        I != E; ++I) {
@@ -113,14 +202,14 @@
   }
 }
 
-void X86IntelAsmPrinter::printOp(const MachineOperand &MO, 
+void X86IntelAsmPrinter::printOp(const MachineOperand &MO,
                                  const char *Modifier) {
   switch (MO.getType()) {
-  case MachineOperand::MO_Register: {      
+  case MachineOperand::MO_Register: {
     if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
       unsigned Reg = MO.getReg();
       if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
-        MVT::ValueType VT = (strcmp(Modifier,"subreg64") == 0) ?
+        MVT VT = (strcmp(Modifier,"subreg64") == 0) ?
           MVT::i64 : ((strcmp(Modifier, "subreg32") == 0) ? MVT::i32 :
                       ((strcmp(Modifier,"subreg16") == 0) ? MVT::i16 :MVT::i8));
         Reg = getX86SubSuperRegister(Reg, VT);
@@ -142,7 +231,7 @@
     O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
       << "_" << MO.getIndex();
     return;
-  }    
+  }
   case MachineOperand::MO_ConstantPoolIndex: {
     bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
     if (!isMemOp) O << "OFFSET ";
@@ -159,17 +248,17 @@
   case MachineOperand::MO_GlobalAddress: {
     bool isCallOp = Modifier && !strcmp(Modifier, "call");
     bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
-    GlobalValue *GV = MO.getGlobal();    
+    GlobalValue *GV = MO.getGlobal();
     std::string Name = Mang->getValueName(GV);
 
-    X86SharedAsmPrinter::decorateName(Name, GV);
+    decorateName(Name, GV);
 
     if (!isMemOp && !isCallOp) O << "OFFSET ";
     if (GV->hasDLLImportLinkage()) {
       // FIXME: This should be fixed with full support of stdcall & fastcall
       // CC's
-      O << "__imp_";          
-    } 
+      O << "__imp_";
+    }
     O << Name;
     int Offset = MO.getOffset();
     if (Offset > 0)
@@ -235,11 +324,11 @@
   O << "]";
 }
 
-void X86IntelAsmPrinter::printPICJumpTableSetLabel(unsigned uid, 
+void X86IntelAsmPrinter::printPICJumpTableSetLabel(unsigned uid,
                                            const MachineBasicBlock *MBB) const {
   if (!TAI->getSetDirective())
     return;
-  
+
   O << TAI->getSetDirective() << ' ' << TAI->getPrivateGlobalPrefix()
     << getFunctionNumber() << '_' << uid << "_set_" << MBB->getNumber() << ',';
   printBasicBlockLabel(MBB, false, false, false);
@@ -277,12 +366,12 @@
 /// PrintAsmOperand - Print out an operand for an inline asm expression.
 ///
 bool X86IntelAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                                         unsigned AsmVariant, 
+                                         unsigned AsmVariant,
                                          const char *ExtraCode) {
   // Does this asm operand have a single letter operand modifier?
   if (ExtraCode && ExtraCode[0]) {
     if (ExtraCode[1] != 0) return true; // Unknown modifier.
-    
+
     switch (ExtraCode[0]) {
     default: return true;  // Unknown modifier.
     case 'b': // Print QImode register
@@ -292,14 +381,14 @@
       return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
     }
   }
-  
+
   printOperand(MI, OpNo);
   return false;
 }
 
 bool X86IntelAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
                                                unsigned OpNo,
-                                               unsigned AsmVariant, 
+                                               unsigned AsmVariant,
                                                const char *ExtraCode) {
   if (ExtraCode && ExtraCode[0])
     return true; // Unknown modifier.
@@ -318,8 +407,8 @@
 }
 
 bool X86IntelAsmPrinter::doInitialization(Module &M) {
-  bool Result = X86SharedAsmPrinter::doInitialization(M);
-  
+  bool Result = AsmPrinter::doInitialization(M);
+
   Mang->markCharUnacceptable('.');
 
   O << "\t.686\n\t.model flat\n\n";
@@ -328,15 +417,15 @@
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
     if (I->isDeclaration()) {
       std::string Name = Mang->getValueName(I);
-      X86SharedAsmPrinter::decorateName(Name, I);
+      decorateName(Name, I);
 
       O << "\textern " ;
       if (I->hasDLLImportLinkage()) {
         O << "__imp_";
-      }      
+      }
       O << Name << ":near\n";
     }
-  
+
   // Emit declarations for external globals.  Note that VC++ always declares
   // external globals to have type byte, and if that's good enough for VC++...
   for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
@@ -347,7 +436,7 @@
       O << "\textern " ;
       if (I->hasDLLImportLinkage()) {
         O << "__imp_";
-      }      
+      }
       O << Name << ":byte\n";
     }
   }
@@ -362,17 +451,18 @@
   for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
        I != E; ++I) {
     if (I->isDeclaration()) continue;   // External global require no code
-    
+
     // Check to see if this is a special global used by LLVM, if so, emit it.
     if (EmitSpecialLLVMGlobal(I))
       continue;
-    
+
     std::string name = Mang->getValueName(I);
     Constant *C = I->getInitializer();
     unsigned Align = TD->getPreferredAlignmentLog(I);
     bool bCustomSegment = false;
 
     switch (I->getLinkage()) {
+    case GlobalValue::CommonLinkage:
     case GlobalValue::LinkOnceLinkage:
     case GlobalValue::WeakLinkage:
       SwitchToDataSection("");
@@ -414,8 +504,7 @@
   }
 
     // Output linker support code for dllexported globals
-  if (!DLLExportedGVs.empty() ||
-      !DLLExportedFns.empty()) {
+  if (!DLLExportedGVs.empty() || !DLLExportedFns.empty()) {
     SwitchToDataSection("");
     O << "; WARNING: The following code is valid only with MASM v8.x and (possible) higher\n"
       << "; This version of MASM is usually shipped with Microsoft Visual Studio 2005\n"
@@ -424,23 +513,19 @@
     O << "_drectve\t segment info alias('.drectve')\n";
   }
 
-  for (std::set<std::string>::iterator i = DLLExportedGVs.begin(),
+  for (StringSet<>::iterator i = DLLExportedGVs.begin(),
          e = DLLExportedGVs.end();
-         i != e; ++i) {
-    O << "\t db ' /EXPORT:" << *i << ",data'\n";
-  }    
+         i != e; ++i)
+    O << "\t db ' /EXPORT:" << i->getKeyData() << ",data'\n";
 
-  for (std::set<std::string>::iterator i = DLLExportedFns.begin(),
+  for (StringSet<>::iterator i = DLLExportedFns.begin(),
          e = DLLExportedFns.end();
-         i != e; ++i) {
-    O << "\t db ' /EXPORT:" << *i << "'\n";
-  }    
-
-  if (!DLLExportedGVs.empty() ||
-      !DLLExportedFns.empty()) {
-    O << "_drectve\t ends\n";    
-  }
-  
+         i != e; ++i)
+    O << "\t db ' /EXPORT:" << i->getKeyData() << "'\n";
+
+  if (!DLLExportedGVs.empty() || !DLLExportedFns.empty())
+    O << "_drectve\t ends\n";
+
   // Bypass X86SharedAsmPrinter::doFinalization().
   bool Result = AsmPrinter::doFinalization(M);
   SwitchToDataSection("");

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86IntelAsmPrinter.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86IntelAsmPrinter.h?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86IntelAsmPrinter.h (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86IntelAsmPrinter.h Sun Jul  6 15:45:41 2008
@@ -14,16 +14,19 @@
 #ifndef X86INTELASMPRINTER_H
 #define X86INTELASMPRINTER_H
 
-#include "X86AsmPrinter.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "X86.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
 
-struct VISIBILITY_HIDDEN X86IntelAsmPrinter : public X86SharedAsmPrinter {
+struct VISIBILITY_HIDDEN X86IntelAsmPrinter : public AsmPrinter {
   X86IntelAsmPrinter(std::ostream &O, X86TargetMachine &TM,
                      const TargetAsmInfo *T)
-      : X86SharedAsmPrinter(O, TM, T) {
+      : AsmPrinter(O, TM, T) {
   }
 
   virtual const char *getPassName() const {
@@ -110,12 +113,31 @@
   bool runOnMachineFunction(MachineFunction &F);
   bool doInitialization(Module &M);
   bool doFinalization(Module &M);
-  
+
+  // We have to propagate some information about MachineFunction to
+  // AsmPrinter. It's ok, when we're printing the function, since we have
+  // access to MachineFunction and can get the appropriate MachineFunctionInfo.
+  // Unfortunately, this is not possible when we're printing reference to
+  // Function (e.g. calling it and so on). Even more, there is no way to get the
+  // corresponding MachineFunctions: it can even be not created at all. That's
+  // why we should use additional structure, when we're collecting all necessary
+  // information.
+  //
+  // This structure is using e.g. for name decoration for stdcall & fastcall'ed
+  // function, since we have to use arguments' size for decoration.
+  typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+  FMFInfoMap FunctionInfoMap;
+
+  void decorateName(std::string& Name, const GlobalValue* GV);
+
   /// getSectionForFunction - Return the section that we should emit the
   /// specified function body into.
   virtual std::string getSectionForFunction(const Function &F) const;
 
   virtual void EmitString(const ConstantArray *CVA) const;
+
+  // Necessary for dllexport support
+  StringSet<> DLLExportedFns, DLLExportedGVs;
 };
 
 } // end namespace llvm

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86JITInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86JITInfo.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86JITInfo.cpp (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86JITInfo.cpp Sun Jul  6 15:45:41 2008
@@ -51,6 +51,13 @@
 #define GETASMPREFIX(X) GETASMPREFIX2(X)
 #define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
 
+// Check if building with -fPIC
+#if defined(__PIC__) && __PIC__ && defined(__linux__)
+#define ASMCALLSUFFIX "@PLT"
+#else
+#define ASMCALLSUFFIX
+#endif
+
 // Provide a convenient way for disabling usage of CFI directives.
 // This is needed for old/broken assemblers (for example, gas on
 // Darwin is pretty old and doesn't support these directives)
@@ -112,7 +119,7 @@
     // JIT callee
     "movq    %rbp, %rdi\n"    // Pass prev frame and return address
     "movq    8(%rbp), %rsi\n"
-    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "call    " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
     // Restore all XMM arg registers
     "movaps  112(%rsp), %xmm7\n"
     "movaps  96(%rsp), %xmm6\n"
@@ -186,7 +193,7 @@
     "movl    4(%ebp), %eax\n" // Pass prev frame and return address
     "movl    %eax, 4(%esp)\n"
     "movl    %ebp, (%esp)\n"
-    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "call    " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
     "movl    %ebp, %esp\n"    // Restore ESP
     CFI(".cfi_def_cfa_register %esp\n")
     "subl    $12, %esp\n"
@@ -240,7 +247,7 @@
     "movl    4(%ebp), %eax\n" // Pass prev frame and return address
     "movl    %eax, 4(%esp)\n"
     "movl    %ebp, (%esp)\n"
-    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "call    " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
     "addl    $16, %esp\n"
     "movaps  48(%esp), %xmm3\n"
     CFI(".cfi_restore %xmm3\n")
@@ -396,8 +403,8 @@
                                          MachineCodeEmitter &MCE) {
 #if defined (X86_64_JIT)
   MCE.startFunctionStub(GV, 8, 8);
-  MCE.emitWordLE(((unsigned *)&ptr)[0]);
-  MCE.emitWordLE(((unsigned *)&ptr)[1]);
+  MCE.emitWordLE((unsigned)(intptr_t)ptr);
+  MCE.emitWordLE((unsigned)(((intptr_t)ptr) >> 32));
 #else
   MCE.startFunctionStub(GV, 4, 4);
   MCE.emitWordLE((intptr_t)ptr);
@@ -420,8 +427,8 @@
     MCE.startFunctionStub(F, 13, 4);
     MCE.emitByte(0x49);          // REX prefix
     MCE.emitByte(0xB8+2);        // movabsq r10
-    MCE.emitWordLE(((unsigned *)&Fn)[0]);
-    MCE.emitWordLE(((unsigned *)&Fn)[1]);
+    MCE.emitWordLE((unsigned)(intptr_t)Fn);
+    MCE.emitWordLE((unsigned)(((intptr_t)Fn) >> 32));
     MCE.emitByte(0x41);          // REX prefix
     MCE.emitByte(0xFF);          // jmpq *r10
     MCE.emitByte(2 | (4 << 3) | (3 << 6));
@@ -437,8 +444,8 @@
   MCE.startFunctionStub(F, 14, 4);
   MCE.emitByte(0x49);          // REX prefix
   MCE.emitByte(0xB8+2);        // movabsq r10
-  MCE.emitWordLE(((unsigned *)&Fn)[0]);
-  MCE.emitWordLE(((unsigned *)&Fn)[1]);
+  MCE.emitWordLE((unsigned)(intptr_t)Fn);
+  MCE.emitWordLE((unsigned)(((intptr_t)Fn) >> 32));
   MCE.emitByte(0x41);          // REX prefix
   MCE.emitByte(0xFF);          // callq *r10
   MCE.emitByte(2 | (2 << 3) | (3 << 6));

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86MachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86MachineFunctionInfo.h?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86MachineFunctionInfo.h (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86MachineFunctionInfo.h Sun Jul  6 15:45:41 2008
@@ -53,20 +53,27 @@
   /// the returnaddr can be savely move to this area
   int TailCallReturnAddrDelta;
 
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
 public:
   X86MachineFunctionInfo() : ForceFramePointer(false),
                              CalleeSavedFrameSize(0),
                              BytesToPopOnReturn(0),
                              DecorationStyle(None),
                              ReturnAddrIndex(0),
-                             TailCallReturnAddrDelta(0) {}
+                             TailCallReturnAddrDelta(0),
+                             SRetReturnReg(0) {}
   
   X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),
                                                 CalleeSavedFrameSize(0),
                                                 BytesToPopOnReturn(0),
                                                 DecorationStyle(None),
                                                 ReturnAddrIndex(0),
-                                                TailCallReturnAddrDelta(0) {}
+                                                TailCallReturnAddrDelta(0),
+                                                SRetReturnReg(0) {}
   
   bool getForceFramePointer() const { return ForceFramePointer;} 
   void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
@@ -85,6 +92,9 @@
 
   int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
   void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
 };
 } // End llvm namespace
 

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.cpp (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.cpp Sun Jul  6 15:45:41 2008
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
@@ -36,6 +37,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
@@ -249,6 +251,19 @@
 // Stack Frame Processing methods
 //===----------------------------------------------------------------------===//
 
+static unsigned calculateMaxStackAlignment(const MachineFrameInfo *FFI) {
+  unsigned MaxAlign = 0;
+  for (int i = FFI->getObjectIndexBegin(),
+         e = FFI->getObjectIndexEnd(); i != e; ++i) {
+    if (FFI->isDeadObjectIndex(i))
+      continue;
+    unsigned Align = FFI->getObjectAlignment(i);
+    MaxAlign = std::max(MaxAlign, Align);
+  }
+
+  return MaxAlign;
+}
+
 // hasFP - Return true if the specified function should have a dedicated frame
 // pointer register.  This is true if the function has variable sized allocas or
 // if frame pointer elimination is disabled.
@@ -257,16 +272,59 @@
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
 
-  return (NoFramePointerElim || 
+  return (NoFramePointerElim ||
+          needsStackRealignment(MF) ||
           MFI->hasVarSizedObjects() ||
           MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
           (MMI && MMI->callsUnwindInit()));
 }
 
+bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();;
+
+  // FIXME: Currently we don't support stack realignment for functions with
+  // variable-sized allocas
+  return (RealignStack &&
+          (MFI->getMaxAlignment() > StackAlign &&
+           !MFI->hasVarSizedObjects()));
+}
+
 bool X86RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
   return !MF.getFrameInfo()->hasVarSizedObjects();
 }
 
+int
+X86RegisterInfo::getFrameIndexOffset(MachineFunction &MF, int FI) const {
+  int Offset = MF.getFrameInfo()->getObjectOffset(FI) + SlotSize;
+  uint64_t StackSize = MF.getFrameInfo()->getStackSize();
+
+  if (needsStackRealignment(MF)) {
+    if (FI < 0)
+      // Skip the saved EBP
+      Offset += SlotSize;
+    else {
+      unsigned Align = MF.getFrameInfo()->getObjectAlignment(FI);
+      assert( (-(Offset + StackSize)) % Align == 0);
+      return Offset + StackSize;
+    }
+
+    // FIXME: Support tail calls
+  } else {
+    if (!hasFP(MF))
+      return Offset + StackSize;
+
+    // Skip the saved EBP
+    Offset += SlotSize;
+
+    // Skip the RETADDR move area
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+    if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta;
+  }
+
+  return Offset;
+}
+
 void X86RegisterInfo::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -333,28 +391,38 @@
   }
 
   int FrameIndex = MI.getOperand(i).getIndex();
+
+  unsigned BasePtr;
+  if (needsStackRealignment(MF))
+    BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+  else
+    BasePtr = (hasFP(MF) ? FramePtr : StackPtr);
+
   // This must be part of a four operand memory reference.  Replace the
   // FrameIndex with base register with EBP.  Add an offset to the offset.
-  MI.getOperand(i).ChangeToRegister(hasFP(MF) ? FramePtr : StackPtr, false);
+  MI.getOperand(i).ChangeToRegister(BasePtr, false);
 
   // Now add the frame object offset to the offset from EBP.
-  int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
-                   MI.getOperand(i+3).getImm()+SlotSize;
+  int64_t Offset = getFrameIndexOffset(MF, FrameIndex) +
+                   MI.getOperand(i+3).getImm();
 
-  if (!hasFP(MF))
-    Offset += MF.getFrameInfo()->getStackSize();
-  else {
-    Offset += SlotSize;  // Skip the saved EBP
-    // Skip the RETADDR move area
-    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
-    if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta;
-  }
-  
   MI.getOperand(i+3).ChangeToImmediate(Offset);
 }
 
 void
+X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                      RegScavenger *RS) const {
+  MachineFrameInfo *FFI = MF.getFrameInfo();
+
+  // Calculate and set max stack object alignment early, so we can decide
+  // whether we will need stack realignment (and thus FP).
+  unsigned MaxAlign = std::max(FFI->getMaxAlignment(),
+                               calculateMaxStackAlignment(FFI));
+
+  FFI->setMaxAlignment(MaxAlign);
+}
+
+void
 X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
@@ -413,7 +481,7 @@
 void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
                       unsigned StackPtr, uint64_t *NumBytes = NULL) {
   if (MBBI == MBB.begin()) return;
-  
+
   MachineBasicBlock::iterator PI = prior(MBBI);
   unsigned Opc = PI->getOpcode();
   if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
@@ -437,12 +505,12 @@
                         MachineBasicBlock::iterator &MBBI,
                         unsigned StackPtr, uint64_t *NumBytes = NULL) {
   return;
-  
+
   if (MBBI == MBB.end()) return;
-  
+
   MachineBasicBlock::iterator NI = next(MBBI);
   if (NI == MBB.end()) return;
-  
+
   unsigned Opc = NI->getOpcode();
   if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
        Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
@@ -462,12 +530,12 @@
 }
 
 /// mergeSPUpdates - Checks the instruction before/after the passed
-/// instruction. If it is an ADD/SUB instruction it is deleted 
+/// instruction. If it is an ADD/SUB instruction it is deleted
 /// argument and the stack adjustment is returned as a positive value for ADD
-/// and a negative for SUB. 
+/// and a negative for SUB.
 static int mergeSPUpdates(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator &MBBI,
-                           unsigned StackPtr,                     
+                           unsigned StackPtr,
                            bool doMergeWithPrevious) {
 
   if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
@@ -491,11 +559,85 @@
     Offset -= PI->getOperand(2).getImm();
     MBB.erase(PI);
     if (!doMergeWithPrevious) MBBI = NI;
-  }   
+  }
 
   return Offset;
 }
 
+void X86RegisterInfo::emitFrameMoves(MachineFunction &MF,
+                                     unsigned FrameLabelId,
+                                     unsigned ReadyLabelId) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  if (!MMI)
+    return;
+
+  uint64_t StackSize = MFI->getStackSize();
+  std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+  const TargetData *TD = MF.getTarget().getTargetData();
+
+  // Calculate amount of bytes used for return address storing
+  int stackGrowth =
+    (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+     TargetFrameInfo::StackGrowsUp ?
+     TD->getPointerSize() : -TD->getPointerSize());
+
+  if (StackSize) {
+    // Show update of SP.
+    if (hasFP(MF)) {
+      // Adjust SP
+      MachineLocation SPDst(MachineLocation::VirtualFP);
+      MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth);
+      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+    } else {
+      MachineLocation SPDst(MachineLocation::VirtualFP);
+      MachineLocation SPSrc(MachineLocation::VirtualFP,
+                            -StackSize+stackGrowth);
+      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+    }
+  } else {
+    //FIXME: Verify & implement for FP
+    MachineLocation SPDst(StackPtr);
+    MachineLocation SPSrc(StackPtr, stackGrowth);
+    Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+  }
+
+  // Add callee saved registers to move list.
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  // FIXME: This is dirty hack. The code itself is pretty mess right now.
+  // It should be rewritten from scratch and generalized sometimes.
+
+  // Determine maximum offset (minumum due to stack growth)
+  int64_t MaxOffset = 0;
+  for (unsigned I = 0, E = CSI.size(); I!=E; ++I)
+    MaxOffset = std::min(MaxOffset,
+                         MFI->getObjectOffset(CSI[I].getFrameIdx()));
+
+  // Calculate offsets
+  int64_t saveAreaOffset = (hasFP(MF) ? 3 : 2)*stackGrowth;
+  for (unsigned I = 0, E = CSI.size(); I!=E; ++I) {
+    int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+    unsigned Reg = CSI[I].getReg();
+    Offset = (MaxOffset-Offset+saveAreaOffset);
+    MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+    MachineLocation CSSrc(Reg);
+    Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+  }
+
+  if (hasFP(MF)) {
+    // Save FP
+    MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth);
+    MachineLocation FPSrc(FramePtr);
+    Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+  }
+
+  MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr);
+  MachineLocation FPSrc(MachineLocation::VirtualFP);
+  Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+}
+
+
 void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -504,57 +646,72 @@
   MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   MachineBasicBlock::iterator MBBI = MBB.begin();
-  bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) || 
+  bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
                           !Fn->doesNotThrow() ||
                           UnwindTablesMandatory;
-  
   // Prepare for frame info.
   unsigned FrameLabelId = 0;
-  
+
   // Get the number of bytes to allocate from the FrameInfo.
   uint64_t StackSize = MFI->getStackSize();
+  // Get desired stack alignment
+  uint64_t MaxAlign  = MFI->getMaxAlignment();
+
   // Add RETADDR move area to callee saved frame size.
   int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
-  if (TailCallReturnAddrDelta < 0)  
+  if (TailCallReturnAddrDelta < 0)
     X86FI->setCalleeSavedFrameSize(
           X86FI->getCalleeSavedFrameSize() +(-TailCallReturnAddrDelta));
-  uint64_t NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
 
   // Insert stack pointer adjustment for later moving of return addr.  Only
   // applies to tail call optimized functions where the callee argument stack
   // size is bigger than the callers.
   if (TailCallReturnAddrDelta < 0) {
-    BuildMI(MBB, MBBI, TII.get(Is64Bit? X86::SUB64ri32 : X86::SUB32ri), 
+    BuildMI(MBB, MBBI, TII.get(Is64Bit? X86::SUB64ri32 : X86::SUB32ri),
             StackPtr).addReg(StackPtr).addImm(-TailCallReturnAddrDelta);
   }
 
+  uint64_t NumBytes = 0;
   if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - SlotSize;
+    if (needsStackRealignment(MF))
+      FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
+
+    NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
     // Get the offset of the stack slot for the EBP register... which is
     // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
     // Update the frame offset adjustment.
-    MFI->setOffsetAdjustment(SlotSize-NumBytes);
+    MFI->setOffsetAdjustment(-NumBytes);
 
     // Save EBP into the appropriate stack slot...
     BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
       .addReg(FramePtr);
-    NumBytes -= SlotSize;
 
     if (needsFrameMoves) {
       // Mark effective beginning of when frame pointer becomes valid.
       FrameLabelId = MMI->NextLabelID();
-      BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(FrameLabelId).addImm(0);
+      BuildMI(MBB, MBBI, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);
     }
 
     // Update EBP with the new base value...
     BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
       .addReg(StackPtr);
-  }
-  
+
+    // Realign stack
+    if (needsStackRealignment(MF))
+      BuildMI(MBB, MBBI,
+              TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
+              StackPtr).addReg(StackPtr).addImm(-MaxAlign);
+  } else
+    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+
   unsigned ReadyLabelId = 0;
   if (needsFrameMoves) {
     // Mark effective beginning of when frame pointer is ready.
     ReadyLabelId = MMI->NextLabelID();
-    BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(ReadyLabelId).addImm(0);
+    BuildMI(MBB, MBBI, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId);
   }
 
   // Skip the callee-saved push instructions.
@@ -575,8 +732,8 @@
                       Reg == X86::AH || Reg == X86::AL);
       }
 
-      // Function prologue calls _alloca to probe the stack when allocating  
-      // more than 4k bytes in one go. Touching the stack at 4K increments is  
+      // Function prologue calls _alloca to probe the stack when allocating
+      // more than 4k bytes in one go. Touching the stack at 4K increments is
       // necessary to ensure that the guard pages used by the OS virtual memory
       // manager are allocated in correct sequence.
       if (!isEAXAlive) {
@@ -593,7 +750,7 @@
           .addExternalSymbol("_alloca");
         // Restore EAX
         MachineInstr *MI = addRegOffset(BuildMI(TII.get(X86::MOV32rm),X86::EAX),
-                                        StackPtr, NumBytes-4);
+                                        StackPtr, false, NumBytes-4);
         MBB.insert(MBBI, MI);
       }
     } else {
@@ -604,95 +761,20 @@
       // If there is an ADD32ri or SUB32ri of ESP immediately after this
       // instruction, merge the two instructions.
       mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
-      
+
       if (NumBytes)
         emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
     }
   }
 
-  if (needsFrameMoves) {
-    std::vector<MachineMove> &Moves = MMI->getFrameMoves();
-    const TargetData *TD = MF.getTarget().getTargetData();
-
-    // Calculate amount of bytes used for return address storing
-    int stackGrowth =
-      (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
-       TargetFrameInfo::StackGrowsUp ?
-       TD->getPointerSize() : -TD->getPointerSize());
-
-    if (StackSize) {
-      // Show update of SP.
-      if (hasFP(MF)) {
-        // Adjust SP
-        MachineLocation SPDst(MachineLocation::VirtualFP);
-        MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth);
-        Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
-      } else {
-        MachineLocation SPDst(MachineLocation::VirtualFP);
-        MachineLocation SPSrc(MachineLocation::VirtualFP,
-                              -StackSize+stackGrowth);
-        Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
-      }
-    } else {
-      //FIXME: Verify & implement for FP
-      MachineLocation SPDst(StackPtr);
-      MachineLocation SPSrc(StackPtr, stackGrowth);
-      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
-    }
-            
-    // Add callee saved registers to move list.
-    const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-
-    // FIXME: This is dirty hack. The code itself is pretty mess right now.
-    // It should be rewritten from scratch and generalized sometimes.
-    
-    // Determine maximum offset (minumum due to stack growth)
-    int64_t MaxOffset = 0;
-    for (unsigned I = 0, E = CSI.size(); I!=E; ++I)
-      MaxOffset = std::min(MaxOffset,
-                           MFI->getObjectOffset(CSI[I].getFrameIdx()));
-
-    // Calculate offsets
-    int64_t saveAreaOffset = (hasFP(MF) ? 3 : 2)*stackGrowth;
-    for (unsigned I = 0, E = CSI.size(); I!=E; ++I) {
-      int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
-      unsigned Reg = CSI[I].getReg();
-      Offset = (MaxOffset-Offset+saveAreaOffset);
-      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
-      MachineLocation CSSrc(Reg);
-      Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
-    }
-    
-    if (hasFP(MF)) {
-      // Save FP
-      MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth);
-      MachineLocation FPSrc(FramePtr);
-      Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
-    }
-    
-    MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr);
-    MachineLocation FPSrc(MachineLocation::VirtualFP);
-    Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
-  }
-
-  // If it's main() on Cygwin\Mingw32 we should align stack as well
-  if (Fn->hasExternalLinkage() && Fn->getName() == "main" &&
-      Subtarget->isTargetCygMing()) {
-    BuildMI(MBB, MBBI, TII.get(X86::AND32ri), X86::ESP)
-                .addReg(X86::ESP).addImm(-StackAlign);
-
-    // Probe the stack
-    BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(StackAlign);
-    BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32)).addExternalSymbol("_alloca");
-  }
+  if (needsFrameMoves)
+    emitFrameMoves(MF, FrameLabelId, ReadyLabelId);
 }
 
 void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const Function* Fn = MF.getFunction();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-  const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
   MachineBasicBlock::iterator MBBI = prior(MBB.end());
   unsigned RetOpcode = MBBI->getOpcode();
 
@@ -713,16 +795,25 @@
 
   // Get the number of bytes to allocate from the FrameInfo
   uint64_t StackSize = MFI->getStackSize();
+  uint64_t MaxAlign  = MFI->getMaxAlignment();
   unsigned CSSize = X86FI->getCalleeSavedFrameSize();
-  uint64_t NumBytes = StackSize - CSSize;
+  uint64_t NumBytes = 0;
 
   if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - SlotSize;
+    if (needsStackRealignment(MF))
+      FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
+
+    NumBytes = FrameSize - CSSize;
+
     // pop EBP.
     BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
-    NumBytes -= SlotSize;
-  }
+  } else
+    NumBytes = StackSize - CSSize;
 
   // Skip the callee-saved pop instructions.
+  MachineBasicBlock::iterator LastCSPop = MBBI;
   while (MBBI != MBB.begin()) {
     MachineBasicBlock::iterator PI = prior(MBBI);
     unsigned Opc = PI->getOpcode();
@@ -738,35 +829,42 @@
     mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
 
   // If dynamic alloca is used, then reset esp to point to the last callee-saved
-  // slot before popping them off!  Also, if it's main() on Cygwin/Mingw32 we
-  // aligned stack in the prologue, - revert stack changes back. Note: we're
-  // assuming, that frame pointer was forced for main()
-  if (MFI->hasVarSizedObjects() ||
-      (Fn->hasExternalLinkage() && Fn->getName() == "main" &&
-       Subtarget->isTargetCygMing())) {
-    unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
+  // slot before popping them off! Same applies for the case, when stack was
+  // realigned
+  if (needsStackRealignment(MF)) {
+    // We cannot use LEA here, because stack pointer was realigned. We need to
+    // deallocate local frame back
+    if (CSSize) {
+      emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+      MBBI = prior(LastCSPop);
+    }
+
+    BuildMI(MBB, MBBI,
+            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+            StackPtr).addReg(FramePtr);
+  } else if (MFI->hasVarSizedObjects()) {
     if (CSSize) {
+      unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
       MachineInstr *MI = addRegOffset(BuildMI(TII.get(Opc), StackPtr),
-                                      FramePtr, -CSSize);
+                                      FramePtr, false, -CSSize);
       MBB.insert(MBBI, MI);
     } else
       BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),StackPtr).
         addReg(FramePtr);
 
-    NumBytes = 0;
+  } else {
+    // adjust stack pointer back: ESP += numbytes
+    if (NumBytes)
+      emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
   }
 
-  // adjust stack pointer back: ESP += numbytes
-  if (NumBytes)
-    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
-
   // We're returning from function via eh_return.
   if (RetOpcode == X86::EH_RETURN) {
     MBBI = prior(MBB.end());
     MachineOperand &DestAddr  = MBBI->getOperand(0);
     assert(DestAddr.isRegister() && "Offset should be in register!");
     BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),StackPtr).
-      addReg(DestAddr.getReg()); 
+      addReg(DestAddr.getReg());
   // Tail call return: adjust the stack pointer and jump to callee
   } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
              RetOpcode== X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64) {
@@ -774,7 +872,7 @@
     MachineOperand &JumpTarget = MBBI->getOperand(0);
     MachineOperand &StackAdjust = MBBI->getOperand(1);
     assert( StackAdjust.isImmediate() && "Expecting immediate value.");
-    
+
     // Adjust stack pointer.
     int StackAdj = StackAdjust.getImm();
     int MaxTCDelta = X86FI->getTCReturnAddrDelta();
@@ -787,7 +885,7 @@
       // Check for possible merge with preceeding ADD instruction.
       Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
       emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII);
-    } 
+    }
     // Jump to label or value in register.
     if (RetOpcode == X86::TCRETURNdi|| RetOpcode == X86::TCRETURNdi64)
       BuildMI(MBB, MBBI, TII.get(X86::TAILJMPd)).
@@ -798,7 +896,7 @@
        BuildMI(MBB, MBBI, TII.get(X86::TAILJMPr), JumpTarget.getReg());
     // Delete the pseudo instruction TCRETURN.
     MBB.erase(MBBI);
-  } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) && 
+  } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
              (X86FI->getTCReturnAddrDelta() < 0)) {
     // Add the return addr area delta back since we are not tail calling.
     int delta = -1*X86FI->getTCReturnAddrDelta();
@@ -820,20 +918,6 @@
   return hasFP(MF) ? FramePtr : StackPtr;
 }
 
-int
-X86RegisterInfo::getFrameIndexOffset(MachineFunction &MF, int FI) const {
-  int Offset = MF.getFrameInfo()->getObjectOffset(FI) + SlotSize;
-  if (!hasFP(MF))
-    return Offset + MF.getFrameInfo()->getStackSize();
-
-  Offset += SlotSize;  // Skip the saved EBP
-  // Skip the RETADDR move area
-  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
-  if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta;
-  return Offset;
-}
-
 void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
                                                                          const {
   // Calculate amount of bytes used for return address storing
@@ -861,8 +945,8 @@
 }
 
 namespace llvm {
-unsigned getX86SubSuperRegister(unsigned Reg, MVT::ValueType VT, bool High) {
-  switch (VT) {
+unsigned getX86SubSuperRegister(unsigned Reg, MVT VT, bool High) {
+  switch (VT.getSimpleVT()) {
   default: return Reg;
   case MVT::i8:
     if (High) {
@@ -1030,3 +1114,37 @@
 
 #include "X86GenRegisterInfo.inc"
 
+namespace {
+  struct VISIBILITY_HIDDEN MSAC : public MachineFunctionPass {
+    static char ID;
+    MSAC() : MachineFunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      MachineRegisterInfo &RI = MF.getRegInfo();
+
+      // Calculate max stack alignment of all already allocated stack objects.
+      unsigned MaxAlign = calculateMaxStackAlignment(FFI);
+
+      // Be over-conservative: scan over all vreg defs and find, whether vector
+      // registers are used. If yes - there is probability, that vector register
+      // will be spilled and thus stack needs to be aligned properly.
+      for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister;
+           RegNum < RI.getLastVirtReg(); ++RegNum)
+        MaxAlign = std::max(MaxAlign, RI.getRegClass(RegNum)->getAlignment());
+
+      FFI->setMaxAlignment(MaxAlign);
+
+      return false;
+    }
+
+    virtual const char *getPassName() const {
+      return "X86 Maximal Stack Alignment Calculator";
+    }
+  };
+
+  char MSAC::ID = 0;
+}
+
+FunctionPass*
+llvm::createX86MaxStackAlignmentCalculatorPass() { return new MSAC(); }

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.h?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.h (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.h Sun Jul  6 15:45:41 2008
@@ -115,6 +115,8 @@
 
   bool hasFP(const MachineFunction &MF) const;
 
+  bool needsStackRealignment(const MachineFunction &MF) const;
+
   bool hasReservedCallFrame(MachineFunction &MF) const;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
@@ -125,10 +127,15 @@
                            int SPAdj, RegScavenger *RS = NULL) const;
 
   void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
 
   void emitPrologue(MachineFunction &MF) const;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
 
+  void emitFrameMoves(MachineFunction &MF,
+                      unsigned FrameLabelId, unsigned ReadyLabelId) const;
+
   // Debug information queries.
   unsigned getRARegister() const;
   unsigned getFrameRegister(MachineFunction &MF) const;
@@ -143,7 +150,7 @@
 // getX86SubSuperRegister - X86 utility function. It returns the sub or super
 // register of a specific X86 register.
 // e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
-unsigned getX86SubSuperRegister(unsigned, MVT::ValueType, bool High=false);
+unsigned getX86SubSuperRegister(unsigned, MVT, bool High=false);
 
 } // End llvm namespace
 

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.td?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.td (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86RegisterInfo.td Sun Jul  6 15:45:41 2008
@@ -509,7 +509,7 @@
 }
 
 // Generic vector registers: VR64 and VR128.
-def VR64  : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64], 64,
+def VR64  : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64, v2f32], 64,
                           [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
 def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
                           [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86Subtarget.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86Subtarget.cpp Sun Jul  6 15:45:41 2008
@@ -16,9 +16,10 @@
 #include "llvm/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
-cl::opt<X86Subtarget::AsmWriterFlavorTy>
+static cl::opt<X86Subtarget::AsmWriterFlavorTy>
 AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::Unset),
   cl::desc("Choose style of code to emit from X86 backend:"),
   cl::values(
@@ -26,10 +27,6 @@
     clEnumValN(X86Subtarget::Intel, "intel", "  Emit Intel-style assembly"),
     clEnumValEnd));
 
-cl::opt<unsigned>
-StackAlignment("stack-alignment", cl::init(0),
-               cl::desc("Override default stack alignment"));
-
 
 /// True if accessing the GV requires an extra load. For Windows, dllimported
 /// symbols are indirect, loading the value at address GV rather then the
@@ -44,11 +41,15 @@
     if (isTargetDarwin()) {
       return (!isDirectCall &&
               (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+               GV->hasCommonLinkage() ||
                (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode())));
     } else if (isTargetELF()) {
-      // Extra load is needed for all non-statics.
-      return (!isDirectCall &&
-              (GV->isDeclaration() || !GV->hasInternalLinkage()));
+      // Extra load is needed for all externally visible.
+      if (isDirectCall)
+        return false;
+      if (GV->hasInternalLinkage() || GV->hasHiddenVisibility())
+        return false;
+      return true;
     } else if (isTargetCygMing() || isTargetWindows()) {
       return (GV->hasDLLImportLinkage());
     }
@@ -244,12 +245,13 @@
   , X863DNowLevel(NoThreeDNow)
   , HasX86_64(false)
   , DarwinVers(0)
+  , IsLinux(false)
   , stackAlignment(8)
   // FIXME: this is a known good value for Yonah. How about others?
   , MaxInlineSizeThreshold(128)
   , Is64Bit(is64Bit)
   , TargetType(isELF) { // Default to ELF unless otherwise specified.
-
+    
   // Determine default and user specified characteristics
   if (!FS.empty()) {
     // If feature string is not empty, parse features string.
@@ -281,6 +283,10 @@
         DarwinVers = atoi(&TT[Pos+7]);
       else
         DarwinVers = 8;  // Minimum supported darwin is Tiger.
+    } else if (TT.find("linux") != std::string::npos) {
+      // Linux doesn't imply ELF, but we don't currently support anything else.
+      TargetType = isELF;
+      IsLinux = true;
     } else if (TT.find("cygwin") != std::string::npos) {
       TargetType = isCygwin;
     } else if (TT.find("mingw") != std::string::npos) {
@@ -305,6 +311,10 @@
     
 #elif defined(_WIN32) || defined(_WIN64)
     TargetType = isWindows;
+#elif defined(__linux__)
+    // Linux doesn't imply ELF, but we don't currently support anything else.
+    TargetType = isELF;
+    IsLinux = true;
 #endif
   }
 
@@ -315,11 +325,9 @@
       ? X86Subtarget::Intel : X86Subtarget::ATT;
   }
 
-  if (TargetType == isDarwin ||
-      TargetType == isCygwin ||
-      TargetType == isMingw  ||
-      TargetType == isWindows ||
-      (TargetType == isELF && Is64Bit))
+  // Stack alignment is 16 bytes on Darwin (both 32 and 64 bit) and for all 64
+  // bit targets.
+  if (TargetType == isDarwin || Is64Bit)
     stackAlignment = 16;
 
   if (StackAlignment)

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86Subtarget.h?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86Subtarget.h Sun Jul  6 15:45:41 2008
@@ -67,7 +67,10 @@
   
   /// DarwinVers - Nonzero if this is a darwin platform: the numeric
   /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
-  unsigned char DarwinVers; // Is any darwin-ppc platform.
+  unsigned char DarwinVers; // Is any darwin-x86 platform.
+
+  /// isLinux - true if this is a "linux" platform.
+  bool IsLinux;
 
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
@@ -132,7 +135,9 @@
   bool isFlavorIntel() const { return AsmFlavor == Intel; }
 
   bool isTargetDarwin() const { return TargetType == isDarwin; }
-  bool isTargetELF() const { return TargetType == isELF; }
+  bool isTargetELF() const {
+    return TargetType == isELF;
+  }
   bool isTargetWindows() const { return TargetType == isWindows; }
   bool isTargetMingw() const { return TargetType == isMingw; }
   bool isTargetCygMing() const { return (TargetType == isMingw ||
@@ -164,6 +169,9 @@
   /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard.
   unsigned getDarwinVers() const { return DarwinVers; }
   
+  /// isLinux - Return true if the target is "Linux".
+  bool isLinux() const { return IsLinux; }
+
   /// True if accessing the GV requires an extra load. For Windows, dllimported
   /// symbols are indirect, loading the value at address GV rather then the
   /// value of GV itself. This means that the GlobalAddress must be in the base

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86TargetAsmInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86TargetAsmInfo.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86TargetAsmInfo.cpp (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86TargetAsmInfo.cpp Sun Jul  6 15:45:41 2008
@@ -66,6 +66,7 @@
     ReadOnlySection = "\t.const\n";
     LCOMMDirective = "\t.lcomm\t";
     SwitchToSectionDirective = "\t.section ";
+    StringConstantPrefix = "\1LC";
     COMMDirectiveTakesAlignment = false;
     HasDotTypeDotSizeDirective = false;
     if (TM.getRelocationModel() == Reloc::Static) {
@@ -120,8 +121,6 @@
     GlobalEHDirective = "\t.globl\t";
     SupportsWeakOmittedEHFrame = false;
     AbsoluteEHSectionOffsets = false;
-    if (Subtarget->is64Bit())
-      ShortenEHDataOn64Bit = true;
     DwarfEHFrameSection =
     ".section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support";
     DwarfExceptionSection = ".section __DATA,__gcc_except_tab";
@@ -229,6 +228,10 @@
     SectionEndDirectiveSuffix = "\tends\n";
   }
 
+  // On Linux we must declare when we can use a non-executable stack.
+  if (Subtarget->isLinux())
+    NonexecutableStackDirective = "\t.section\t.note.GNU-stack,\"\", at progbits";
+
   AssemblerDialect = Subtarget->getAsmFlavor();
 }
 

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86TargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86TargetMachine.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86TargetMachine.cpp (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86TargetMachine.cpp Sun Jul  6 15:45:41 2008
@@ -20,7 +20,6 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetMachineRegistry.h"
-#include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
 /// X86TargetMachineModule - Note that this is used on hosts that cannot link
@@ -30,13 +29,11 @@
 extern "C" int X86TargetMachineModule;
 int X86TargetMachineModule = 0;
 
-namespace {
-  // Register the target.
-  RegisterTarget<X86_32TargetMachine>
-  X("x86",    "  32-bit X86: Pentium-Pro and above");
-  RegisterTarget<X86_64TargetMachine>
-  Y("x86-64", "  64-bit X86: EM64T and AMD64");
-}
+// Register the target.
+static RegisterTarget<X86_32TargetMachine>
+X("x86",    "  32-bit X86: Pentium-Pro and above");
+static RegisterTarget<X86_64TargetMachine>
+Y("x86-64", "  64-bit X86: EM64T and AMD64");
 
 const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const {
   return new X86TargetAsmInfo(*this);
@@ -161,6 +158,13 @@
   return false;
 }
 
+bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM, bool Fast) {
+  // Calculate and set max stack object alignment early, so we can decide
+  // whether we will need stack realignment (and thus FP).
+  PM.add(createX86MaxStackAlignmentCalculatorPass());
+  return false;  // -print-machineinstr shouldn't print after this.
+}
+
 bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM, bool Fast) {
   PM.add(createX86FloatingPointStackifierPass());
   return true;  // -print-machineinstr should print after this.
@@ -175,10 +179,8 @@
 bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, bool Fast,
                                       bool DumpAsm, MachineCodeEmitter &MCE) {
   // FIXME: Move this to TargetJITInfo!
-  if (DefRelocModel == Reloc::Default) {
+  if (DefRelocModel == Reloc::Default)
     setRelocationModel(Reloc::Static);
-    Subtarget.setPICStyle(PICStyle::None);
-  }
   
   // JIT cannot ensure globals are placed in the lower 4G of address.
   if (Subtarget.is64Bit())

Modified: llvm/branches/non-call-eh/lib/Target/X86/X86TargetMachine.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Target/X86/X86TargetMachine.h?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Target/X86/X86TargetMachine.h (original)
+++ llvm/branches/non-call-eh/lib/Target/X86/X86TargetMachine.h Sun Jul  6 15:45:41 2008
@@ -44,12 +44,12 @@
 
   virtual const X86InstrInfo     *getInstrInfo() const { return &InstrInfo; }
   virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
-  virtual       TargetJITInfo    *getJITInfo()         { return &JITInfo; }
-  virtual const TargetSubtarget  *getSubtargetImpl() const{ return &Subtarget; }
+  virtual       X86JITInfo       *getJITInfo()         { return &JITInfo; }
+  virtual const X86Subtarget     *getSubtargetImpl() const{ return &Subtarget; }
   virtual       X86TargetLowering *getTargetLowering() const { 
     return const_cast<X86TargetLowering*>(&TLInfo); 
   }
-  virtual const TargetRegisterInfo    *getRegisterInfo() const {
+  virtual const X86RegisterInfo  *getRegisterInfo() const {
     return &InstrInfo.getRegisterInfo();
   }
   virtual const TargetData       *getTargetData() const { return &DataLayout; }
@@ -61,7 +61,8 @@
   static unsigned getJITMatchQuality();
   
   // Set up the pass pipeline.
-  virtual bool addInstSelector(PassManagerBase &PM, bool Fast);  
+  virtual bool addInstSelector(PassManagerBase &PM, bool Fast);
+  virtual bool addPreRegAlloc(PassManagerBase &PM, bool Fast);
   virtual bool addPostRegAlloc(PassManagerBase &PM, bool Fast);
   virtual bool addAssemblyEmitter(PassManagerBase &PM, bool Fast, 
                                   std::ostream &Out);

Modified: llvm/branches/non-call-eh/lib/Transforms/Hello/Hello.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Hello/Hello.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Hello/Hello.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Hello/Hello.cpp Sun Jul  6 15:45:41 2008
@@ -36,10 +36,12 @@
       return false;
     }
   };
+}
 
-  char Hello::ID = 0;
-  RegisterPass<Hello> X("hello", "Hello World Pass");
+char Hello::ID = 0;
+static RegisterPass<Hello> X("hello", "Hello World Pass");
 
+namespace {
   // Hello2 - The second implementation with getAnalysisUsage implemented.
   struct Hello2 : public FunctionPass {
     static char ID; // Pass identification, replacement for typeid
@@ -58,7 +60,8 @@
       AU.setPreservesAll();
     };
   };
-  char Hello2::ID = 0;
-  RegisterPass<Hello2> Y("hello2",
-                        "Hello World Pass (with getAnalysisUsage implemented)");
 }
+
+char Hello2::ID = 0;
+static RegisterPass<Hello2>
+Y("hello2", "Hello World Pass (with getAnalysisUsage implemented)");

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/ArgumentPromotion.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/ArgumentPromotion.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/ArgumentPromotion.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/ArgumentPromotion.cpp Sun Jul  6 15:45:41 2008
@@ -66,7 +66,8 @@
 
     virtual bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
     static char ID; // Pass identification, replacement for typeid
-    ArgPromotion(unsigned maxElements = 3) : CallGraphSCCPass((intptr_t)&ID), maxElements(maxElements) {}
+    ArgPromotion(unsigned maxElements = 3) : CallGraphSCCPass((intptr_t)&ID),
+                                             maxElements(maxElements) {}
 
   private:
     bool PromoteArguments(CallGraphNode *CGN);
@@ -74,15 +75,15 @@
     Function *DoPromotion(Function *F, 
                           SmallPtrSet<Argument*, 8> &ArgsToPromote,
                           SmallPtrSet<Argument*, 8> &ByValArgsToTransform);
-	/// The maximum number of elements to expand, or 0 for unlimited.
-	unsigned maxElements;
+    /// The maximum number of elements to expand, or 0 for unlimited.
+    unsigned maxElements;
   };
-
-  char ArgPromotion::ID = 0;
-  RegisterPass<ArgPromotion> X("argpromotion",
-                               "Promote 'by reference' arguments to scalars");
 }
 
+char ArgPromotion::ID = 0;
+static RegisterPass<ArgPromotion>
+X("argpromotion", "Promote 'by reference' arguments to scalars");
+
 Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
   return new ArgPromotion(maxElements);
 }
@@ -147,16 +148,16 @@
     Argument *PtrArg = PointerArgs[i].first;
     if (isByVal) {
       const Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
-      if (const StructType *STy = dyn_cast<StructType>(AgTy))
+      if (const StructType *STy = dyn_cast<StructType>(AgTy)) {
         if (maxElements > 0 && STy->getNumElements() > maxElements) {
           DOUT << "argpromotion disable promoting argument '"
                << PtrArg->getName() << "' because it would require adding more "
                << "than " << maxElements << " arguments to the function.\n";
         } else {
-          // If all the elements are first class types, we can promote it.
+          // If all the elements are single-value types, we can promote it.
           bool AllSimple = true;
           for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
-            if (!STy->getElementType(i)->isFirstClassType()) {
+            if (!STy->getElementType(i)->isSingleValueType()) {
               AllSimple = false;
               break;
             }
@@ -169,6 +170,7 @@
             continue;
           }
         }
+      }
     }
     
     // Otherwise, see if we can promote the pointer to its value.
@@ -260,8 +262,9 @@
       }
       // Ensure that all of the indices are constants.
       SmallVector<ConstantInt*, 8> Operands;
-      for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i)
-        if (ConstantInt *C = dyn_cast<ConstantInt>(GEP->getOperand(i)))
+      for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+	   i != e; ++i)
+        if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
           Operands.push_back(C);
         else
           return false;  // Not a constant operand GEP!
@@ -290,7 +293,7 @@
                << Arg->getName() << "' because it would require adding more "
                << "than " << maxElements << " arguments to the function.\n";
           // We limit aggregate promotion to only promoting up to a fixed number
-		  // of elements of the aggregate.
+          // of elements of the aggregate.
           return false;
         }
         GEPIndices.push_back(Operands);
@@ -477,15 +480,13 @@
 
   // Create the new function body and insert it into the module...
   Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
-  NF->setCallingConv(F->getCallingConv());
+  NF->copyAttributesFrom(F);
 
   // Recompute the parameter attributes list based on the new arguments for
   // the function.
   NF->setParamAttrs(PAListPtr::get(ParamAttrsVec.begin(), ParamAttrsVec.end()));
   ParamAttrsVec.clear();
-  
-  if (F->hasCollector())
-    NF->setCollector(F->getCollector());
+
   F->getParent()->getFunctionList().insert(F, NF);
   NF->takeName(F);
 

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/ConstantMerge.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/ConstantMerge.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/ConstantMerge.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/ConstantMerge.cpp Sun Jul  6 15:45:41 2008
@@ -38,11 +38,12 @@
     //
     bool runOnModule(Module &M);
   };
-
-  char ConstantMerge::ID = 0;
-  RegisterPass<ConstantMerge>X("constmerge","Merge Duplicate Global Constants");
 }
 
+char ConstantMerge::ID = 0;
+static RegisterPass<ConstantMerge>
+X("constmerge", "Merge Duplicate Global Constants");
+
 ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); }
 
 bool ConstantMerge::runOnModule(Module &M) {

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/DeadArgumentElimination.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/DeadArgumentElimination.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/DeadArgumentElimination.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/DeadArgumentElimination.cpp Sun Jul  6 15:45:41 2008
@@ -97,9 +97,13 @@
 
     void RemoveDeadArgumentsFromFunction(Function *F);
   };
-  char DAE::ID = 0;
-  RegisterPass<DAE> X("deadargelim", "Dead Argument Elimination");
+}
+
+char DAE::ID = 0;
+static RegisterPass<DAE>
+X("deadargelim", "Dead Argument Elimination");
 
+namespace {
   /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
   /// deletes arguments to functions which are external.  This is only for use
   /// by bugpoint.
@@ -107,11 +111,12 @@
     static char ID;
     virtual bool ShouldHackArguments() const { return true; }
   };
-  char DAH::ID = 0;
-  RegisterPass<DAH> Y("deadarghaX0r",
-                      "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)");
 }
 
+char DAH::ID = 0;
+static RegisterPass<DAH>
+Y("deadarghaX0r", "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)");
+
 /// createDeadArgEliminationPass - This pass removes arguments from functions
 /// which are not used by the body of the function.
 ///
@@ -158,10 +163,7 @@
 
   // Create the new function body and insert it into the module...
   Function *NF = Function::Create(NFTy, Fn.getLinkage());
-  NF->setCallingConv(Fn.getCallingConv());
-  NF->setParamAttrs(Fn.getParamAttrs());
-  if (Fn.hasCollector())
-    NF->setCollector(Fn.getCollector());
+  NF->copyAttributesFrom(&Fn);
   Fn.getParent()->getFunctionList().insert(&Fn, NF);
   NF->takeName(&Fn);
 
@@ -300,6 +302,12 @@
     FunctionIntrinsicallyLive = true;
   else
     for (Value::use_iterator I = F.use_begin(), E = F.use_end(); I != E; ++I) {
+      // If the function is PASSED IN as an argument, its address has been taken
+      if (I.getOperandNo() != 0) {
+        FunctionIntrinsicallyLive = true;
+        break;
+      }
+
       // If this use is anything other than a call site, the function is alive.
       CallSite CS = CallSite::get(*I);
       Instruction *TheCall = CS.getInstruction();
@@ -327,15 +335,6 @@
             RetValLiveness = Live;
             break;
           }
-
-      // If the function is PASSED IN as an argument, its address has been taken
-      for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
-           AI != E; ++AI)
-        if (AI->get() == &F) {
-          FunctionIntrinsicallyLive = true;
-          break;
-        }
-      if (FunctionIntrinsicallyLive) break;
     }
 
   if (FunctionIntrinsicallyLive) {
@@ -551,10 +550,8 @@
 
   // Create the new function body and insert it into the module...
   Function *NF = Function::Create(NFTy, F->getLinkage());
-  NF->setCallingConv(F->getCallingConv());
+  NF->copyAttributesFrom(F);
   NF->setParamAttrs(NewPAL);
-  if (F->hasCollector())
-    NF->setCollector(F->getCollector());
   F->getParent()->getFunctionList().insert(F, NF);
   NF->takeName(F);
 
@@ -626,7 +623,7 @@
 
     // Finally, remove the old call from the program, reducing the use-count of
     // F.
-    Call->getParent()->getInstList().erase(Call);
+    Call->eraseFromParent();
   }
 
   // Since we have now created the new function, splice the body of the old
@@ -665,7 +662,7 @@
       }
 
   // Now that the old function is dead, delete it.
-  F->getParent()->getFunctionList().erase(F);
+  F->eraseFromParent();
 }
 
 bool DAE::runOnModule(Module &M) {

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/DeadTypeElimination.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/DeadTypeElimination.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/DeadTypeElimination.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/DeadTypeElimination.cpp Sun Jul  6 15:45:41 2008
@@ -43,10 +43,11 @@
       AU.addRequired<FindUsedTypes>();
     }
   };
-  char DTE::ID = 0;
-  RegisterPass<DTE> X("deadtypeelim", "Dead Type Elimination");
 }
 
+char DTE::ID = 0;
+static RegisterPass<DTE> X("deadtypeelim", "Dead Type Elimination");
+
 ModulePass *llvm::createDeadTypeEliminationPass() {
   return new DTE();
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/ExtractGV.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/ExtractGV.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/ExtractGV.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/ExtractGV.cpp Sun Jul  6 15:45:41 2008
@@ -123,10 +123,7 @@
         if (std::find(Named.begin(), Named.end(), &*I) == Named.end()) {
           Function *New = Function::Create(I->getFunctionType(),
                                            GlobalValue::ExternalLinkage);
-          New->setCallingConv(I->getCallingConv());
-          New->setParamAttrs(I->getParamAttrs());
-          if (I->hasCollector())
-            New->setCollector(I->getCollector());
+          New->copyAttributesFrom(I);
 
           // If it's not the named function, delete the body of the function
           I->dropAllReferences();

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/GlobalDCE.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/GlobalDCE.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/GlobalDCE.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/GlobalDCE.cpp Sun Jul  6 15:45:41 2008
@@ -49,10 +49,11 @@
     bool SafeToDestroyConstant(Constant* C);
     bool RemoveUnusedGlobalValue(GlobalValue &GV);
   };
-  char GlobalDCE::ID = 0;
-  RegisterPass<GlobalDCE> X("globaldce", "Dead Global Elimination");
 }
 
+char GlobalDCE::ID = 0;
+static RegisterPass<GlobalDCE> X("globaldce", "Dead Global Elimination");
+
 ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); }
 
 bool GlobalDCE::runOnModule(Module &M) {

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/GlobalOpt.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/GlobalOpt.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/GlobalOpt.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/GlobalOpt.cpp Sun Jul  6 15:45:41 2008
@@ -28,6 +28,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -67,13 +68,15 @@
     bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
     bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
   };
-
-  char GlobalOpt::ID = 0;
-  RegisterPass<GlobalOpt> X("globalopt", "Global Variable Optimizer");
 }
 
+char GlobalOpt::ID = 0;
+static RegisterPass<GlobalOpt> X("globalopt", "Global Variable Optimizer");
+
 ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
 
+namespace {
+
 /// GlobalStatus - As we analyze each global, keep track of some information
 /// about it.  If we find out that the address of the global is taken, none of
 /// this info will be accurate.
@@ -128,7 +131,7 @@
                    HasNonInstructionUser(false), HasPHIUser(false) {}
 };
 
-
+}
 
 /// ConstantIsDead - Return true if the specified constant is (transitively)
 /// dead.  The constant may be used by other constants (e.g. constant arrays and
@@ -455,7 +458,7 @@
 /// behavior of the program in a more fine-grained way.  We have determined that
 /// this transformation is safe already.  We return the first global variable we
 /// insert so that the caller can reprocess it.
-static GlobalVariable *SRAGlobal(GlobalVariable *GV) {
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
   // Make sure this global only has simple uses that we can SRA.
   if (!GlobalUsersSafeToSRA(GV))
     return 0;
@@ -467,8 +470,14 @@
   std::vector<GlobalVariable*> NewGlobals;
   Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
 
+  // Get the alignment of the global, either explicit or target-specific.
+  unsigned StartAlignment = GV->getAlignment();
+  if (StartAlignment == 0)
+    StartAlignment = TD.getABITypeAlignment(GV->getType());
+   
   if (const StructType *STy = dyn_cast<StructType>(Ty)) {
     NewGlobals.reserve(STy->getNumElements());
+    const StructLayout &Layout = *TD.getStructLayout(STy);
     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
       Constant *In = getAggregateConstantElement(Init,
                                             ConstantInt::get(Type::Int32Ty, i));
@@ -480,19 +489,28 @@
                                                GV->isThreadLocal());
       Globals.insert(GV, NGV);
       NewGlobals.push_back(NGV);
+      
+      // Calculate the known alignment of the field.  If the original aggregate
+      // had 256 byte alignment for example, something might depend on that:
+      // propagate info to each field.
+      uint64_t FieldOffset = Layout.getElementOffset(i);
+      unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset);
+      if (NewAlign > TD.getABITypeAlignment(STy->getElementType(i)))
+        NGV->setAlignment(NewAlign);
     }
   } else if (const SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
     unsigned NumElements = 0;
     if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
       NumElements = ATy->getNumElements();
-    else if (const VectorType *PTy = dyn_cast<VectorType>(STy))
-      NumElements = PTy->getNumElements();
     else
-      assert(0 && "Unknown aggregate sequential type!");
+      NumElements = cast<VectorType>(STy)->getNumElements();
 
     if (NumElements > 16 && GV->hasNUsesOrMore(16))
       return 0; // It's not worth it.
     NewGlobals.reserve(NumElements);
+    
+    uint64_t EltSize = TD.getABITypeSize(STy->getElementType());
+    unsigned EltAlign = TD.getABITypeAlignment(STy->getElementType());
     for (unsigned i = 0, e = NumElements; i != e; ++i) {
       Constant *In = getAggregateConstantElement(Init,
                                             ConstantInt::get(Type::Int32Ty, i));
@@ -505,6 +523,13 @@
                                                GV->isThreadLocal());
       Globals.insert(GV, NGV);
       NewGlobals.push_back(NGV);
+      
+      // Calculate the known alignment of the field.  If the original aggregate
+      // had 256 byte alignment for example, something might depend on that:
+      // propagate info to each field.
+      unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i);
+      if (NewAlign > EltAlign)
+        NGV->setAlignment(NewAlign);
     }
   }
 
@@ -679,8 +704,9 @@
       // Should handle GEP here.
       SmallVector<Constant*, 8> Idxs;
       Idxs.reserve(GEPI->getNumOperands()-1);
-      for (unsigned i = 1, e = GEPI->getNumOperands(); i != e; ++i)
-        if (Constant *C = dyn_cast<Constant>(GEPI->getOperand(i)))
+      for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+           i != e; ++i)
+        if (Constant *C = dyn_cast<Constant>(*i))
           Idxs.push_back(C);
         else
           break;
@@ -804,6 +830,9 @@
                                              GV->getName()+".body",
                                              (Module *)NULL,
                                              GV->isThreadLocal());
+  // FIXME: This new global should have the alignment returned by malloc.  Code
+  // could depend on malloc returning large alignment (on the mac, 16 bytes) but
+  // this would only guarantee some lower alignment.
   GV->getParent()->getGlobalList().insert(GV, NewGV);
 
   // Anything that used the malloc now uses the global directly.
@@ -844,7 +873,7 @@
           case ICmpInst::ICMP_ULE:
           case ICmpInst::ICMP_SLE:
           case ICmpInst::ICMP_EQ:
-            LV = BinaryOperator::createNot(LV, "notinit", CI);
+            LV = BinaryOperator::CreateNot(LV, "notinit", CI);
             break;
           case ICmpInst::ICMP_NE:
           case ICmpInst::ICMP_UGE:
@@ -956,8 +985,8 @@
     if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
       // We permit two users of the load: setcc comparing against the null
       // pointer, and a getelementptr of a specific form.
-      for (Value::use_iterator UI = LI->use_begin(), E = LI->use_end(); UI != E; 
-           ++UI) {
+      for (Value::use_iterator UI = LI->use_begin(), E = LI->use_end();
+	   UI != E; ++UI) {
         // Comparison against null is ok.
         if (ICmpInst *ICI = dyn_cast<ICmpInst>(*UI)) {
           if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
@@ -1054,7 +1083,8 @@
     GEPIdx.push_back(GEPI->getOperand(1));
     GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
     
-    Value *NGEPI = GetElementPtrInst::Create(NewPtr, GEPIdx.begin(), GEPIdx.end(),
+    Value *NGEPI = GetElementPtrInst::Create(NewPtr,
+                                             GEPIdx.begin(), GEPIdx.end(),
                                              GEPI->getName(), GEPI);
     GEPI->replaceAllUsesWith(NGEPI);
     GEPI->eraseFromParent();
@@ -1164,7 +1194,7 @@
     if (!RunningOr)
       RunningOr = Cond;   // First seteq
     else
-      RunningOr = BinaryOperator::createOr(RunningOr, Cond, "tmp", MI);
+      RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", MI);
   }
 
   // Split the basic block at the old malloc.
@@ -1250,9 +1280,10 @@
   else if (GetElementPtrInst *GEPI =dyn_cast<GetElementPtrInst>(StoredOnceVal)){
     // "getelementptr Ptr, 0, 0, 0" is really just a cast.
     bool IsJustACast = true;
-    for (unsigned i = 1, e = GEPI->getNumOperands(); i != e; ++i)
-      if (!isa<Constant>(GEPI->getOperand(i)) ||
-          !cast<Constant>(GEPI->getOperand(i))->isNullValue()) {
+    for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+         i != e; ++i)
+      if (!isa<Constant>(*i) ||
+          !cast<Constant>(*i)->isNullValue()) {
         IsJustACast = false;
         break;
       }
@@ -1464,11 +1495,11 @@
     // this global a local variable) we replace the global with a local alloca
     // in this function.
     //
-    // NOTE: It doesn't make sense to promote non first class types since we
+    // NOTE: It doesn't make sense to promote non single-value types since we
     // are just replacing static memory to stack memory.
     if (!GS.HasMultipleAccessingFunctions &&
         GS.AccessingFunction && !GS.HasNonInstructionUser &&
-        GV->getType()->getElementType()->isFirstClassType() &&
+        GV->getType()->getElementType()->isSingleValueType() &&
         GS.AccessingFunction->getName() == "main" &&
         GS.AccessingFunction->hasExternalLinkage()) {
       DOUT << "LOCALIZING GLOBAL: " << *GV;
@@ -1519,8 +1550,9 @@
 
       ++NumMarked;
       return true;
-    } else if (!GV->getInitializer()->getType()->isFirstClassType()) {
-      if (GlobalVariable *FirstNewGV = SRAGlobal(GV)) {
+    } else if (!GV->getInitializer()->getType()->isSingleValueType()) {
+      if (GlobalVariable *FirstNewGV = SRAGlobal(GV, 
+                                                 getAnalysis<TargetData>())) {
         GVI = FirstNewGV;  // Don't skip the newly produced globals!
         return true;
       }
@@ -1576,8 +1608,9 @@
     if (!isa<CallInst>(User) && !isa<InvokeInst>(User)) return false;
 
     // See if the function address is passed as an argument.
-    for (unsigned i = 1, e = User->getNumOperands(); i != e; ++i)
-      if (User->getOperand(i) == F) return false;
+    for (User::op_iterator i = User->op_begin() + 1, e = User->op_end();
+	 i != e; ++i)
+      if (*i == F) return false;
   }
   return true;
 }
@@ -1682,8 +1715,8 @@
       if (!I->hasInitializer()) return 0;
       ConstantArray *CA = dyn_cast<ConstantArray>(I->getInitializer());
       if (!CA) return 0;
-      for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
-        if (ConstantStruct *CS = dyn_cast<ConstantStruct>(CA->getOperand(i))) {
+      for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i)
+        if (ConstantStruct *CS = dyn_cast<ConstantStruct>(*i)) {
           if (isa<ConstantPointerNull>(CS->getOperand(1)))
             continue;
 
@@ -1710,8 +1743,8 @@
   ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
   std::vector<Function*> Result;
   Result.reserve(CA->getNumOperands());
-  for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
-    ConstantStruct *CS = cast<ConstantStruct>(CA->getOperand(i));
+  for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+    ConstantStruct *CS = cast<ConstantStruct>(*i);
     Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
   }
   return Result;
@@ -1824,8 +1857,8 @@
 
     // Break up the constant into its elements.
     if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Init)) {
-      for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i)
-        Elts.push_back(CS->getOperand(i));
+      for (User::op_iterator i = CS->op_begin(), e = CS->op_end(); i != e; ++i)
+        Elts.push_back(cast<Constant>(*i));
     } else if (isa<ConstantAggregateZero>(Init)) {
       for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
         Elts.push_back(Constant::getNullValue(STy->getElementType(i)));
@@ -1852,8 +1885,8 @@
     // Break up the array into elements.
     std::vector<Constant*> Elts;
     if (ConstantArray *CA = dyn_cast<ConstantArray>(Init)) {
-      for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
-        Elts.push_back(CA->getOperand(i));
+      for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i)
+        Elts.push_back(cast<Constant>(*i));
     } else if (isa<ConstantAggregateZero>(Init)) {
       Constant *Elt = Constant::getNullValue(ATy->getElementType());
       Elts.assign(ATy->getNumElements(), Elt);
@@ -1981,8 +2014,9 @@
     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
       Constant *P = getVal(Values, GEP->getOperand(0));
       SmallVector<Constant*, 8> GEPOps;
-      for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i)
-        GEPOps.push_back(getVal(Values, GEP->getOperand(i)));
+      for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+           i != e; ++i)
+        GEPOps.push_back(getVal(Values, *i));
       InstResult = ConstantExpr::getGetElementPtr(P, &GEPOps[0], GEPOps.size());
     } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
       if (LI->isVolatile()) return false;  // no volatile accesses.
@@ -2006,8 +2040,9 @@
       if (!Callee) return false;  // Cannot resolve.
 
       std::vector<Constant*> Formals;
-      for (unsigned i = 1, e = CI->getNumOperands(); i != e; ++i)
-        Formals.push_back(getVal(Values, CI->getOperand(i)));
+      for (User::op_iterator i = CI->op_begin() + 1, e = CI->op_end();
+           i != e; ++i)
+        Formals.push_back(getVal(Values, *i));
       
       if (Callee->isDeclaration()) {
         // If this is a function we can constant fold, do it.

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/IPConstantPropagation.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/IPConstantPropagation.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/IPConstantPropagation.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/IPConstantPropagation.cpp Sun Jul  6 15:45:41 2008
@@ -21,6 +21,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/ADT/Statistic.h"
@@ -42,10 +43,12 @@
     bool PropagateConstantsIntoArguments(Function &F);
     bool PropagateConstantReturn(Function &F);
   };
-  char IPCP::ID = 0;
-  RegisterPass<IPCP> X("ipconstprop", "Interprocedural constant propagation");
 }
 
+char IPCP::ID = 0;
+static RegisterPass<IPCP>
+X("ipconstprop", "Interprocedural constant propagation");
+
 ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); }
 
 bool IPCP::runOnModule(Module &M) {
@@ -76,156 +79,199 @@
 bool IPCP::PropagateConstantsIntoArguments(Function &F) {
   if (F.arg_empty() || F.use_empty()) return false; // No arguments? Early exit.
 
-  std::vector<std::pair<Constant*, bool> > ArgumentConstants;
+  // For each argument, keep track of its constant value and whether it is a
+  // constant or not.  The bool is driven to true when found to be non-constant.
+  SmallVector<std::pair<Constant*, bool>, 16> ArgumentConstants;
   ArgumentConstants.resize(F.arg_size());
 
   unsigned NumNonconstant = 0;
-
-  for (Value::use_iterator I = F.use_begin(), E = F.use_end(); I != E; ++I)
-    if (!isa<Instruction>(*I))
-      return false;  // Used by a non-instruction, do not transform
-    else {
-      CallSite CS = CallSite::get(cast<Instruction>(*I));
-      if (CS.getInstruction() == 0 ||
-          CS.getCalledFunction() != &F)
-        return false;  // Not a direct call site?
-
-      // Check out all of the potentially constant arguments
-      CallSite::arg_iterator AI = CS.arg_begin();
-      Function::arg_iterator Arg = F.arg_begin();
-      for (unsigned i = 0, e = ArgumentConstants.size(); i != e;
-           ++i, ++AI, ++Arg) {
-        if (*AI == &F) return false;  // Passes the function into itself
-
-        if (!ArgumentConstants[i].second) {
-          if (Constant *C = dyn_cast<Constant>(*AI)) {
-            if (!ArgumentConstants[i].first)
-              ArgumentConstants[i].first = C;
-            else if (ArgumentConstants[i].first != C) {
-              // Became non-constant
-              ArgumentConstants[i].second = true;
-              ++NumNonconstant;
-              if (NumNonconstant == ArgumentConstants.size()) return false;
-            }
-          } else if (*AI != &*Arg) {    // Ignore recursive calls with same arg
-            // This is not a constant argument.  Mark the argument as
-            // non-constant.
-            ArgumentConstants[i].second = true;
-            ++NumNonconstant;
-            if (NumNonconstant == ArgumentConstants.size()) return false;
-          }
-        }
+  for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) {
+    // Used by a non-instruction, or not the callee of a function, do not
+    // transform.
+    if (UI.getOperandNo() != 0 ||
+        (!isa<CallInst>(*UI) && !isa<InvokeInst>(*UI)))
+      return false;
+    
+    CallSite CS = CallSite::get(cast<Instruction>(*UI));
+
+    // Check out all of the potentially constant arguments.  Note that we don't
+    // inspect varargs here.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    Function::arg_iterator Arg = F.arg_begin();
+    for (unsigned i = 0, e = ArgumentConstants.size(); i != e;
+         ++i, ++AI, ++Arg) {
+      
+      // If this argument is known non-constant, ignore it.
+      if (ArgumentConstants[i].second)
+        continue;
+      
+      Constant *C = dyn_cast<Constant>(*AI);
+      if (C && ArgumentConstants[i].first == 0) {
+        ArgumentConstants[i].first = C;   // First constant seen.
+      } else if (C && ArgumentConstants[i].first == C) {
+        // Still the constant value we think it is.
+      } else if (*AI == &*Arg) {
+        // Ignore recursive calls passing argument down.
+      } else {
+        // Argument became non-constant.  If all arguments are non-constant now,
+        // give up on this function.
+        if (++NumNonconstant == ArgumentConstants.size())
+          return false;
+        ArgumentConstants[i].second = true;
       }
     }
+  }
 
   // If we got to this point, there is a constant argument!
   assert(NumNonconstant != ArgumentConstants.size());
-  Function::arg_iterator AI = F.arg_begin();
   bool MadeChange = false;
-  for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI)
-    // Do we have a constant argument!?
-    if (!ArgumentConstants[i].second && !AI->use_empty()) {
-      Value *V = ArgumentConstants[i].first;
-      if (V == 0) V = UndefValue::get(AI->getType());
-      AI->replaceAllUsesWith(V);
-      ++NumArgumentsProped;
-      MadeChange = true;
-    }
+  Function::arg_iterator AI = F.arg_begin();
+  for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) {
+    // Do we have a constant argument?
+    if (ArgumentConstants[i].second || AI->use_empty())
+      continue;
+  
+    Value *V = ArgumentConstants[i].first;
+    if (V == 0) V = UndefValue::get(AI->getType());
+    AI->replaceAllUsesWith(V);
+    ++NumArgumentsProped;
+    MadeChange = true;
+  }
   return MadeChange;
 }
 
 
-// Check to see if this function returns a constant.  If so, replace all callers
-// that user the return value with the returned valued.  If we can replace ALL
-// callers,
+// Check to see if this function returns one or more constants. If so, replace
+// all callers that use those return values with the constant value. This will
+// leave in the actual return values and instructions, but deadargelim will
+// clean that up.
+//
+// Additionally if a function always returns one of its arguments directly,
+// callers will be updated to use the value they pass in directly instead of
+// using the return value.
 bool IPCP::PropagateConstantReturn(Function &F) {
   if (F.getReturnType() == Type::VoidTy)
     return false; // No return value.
 
+  // If this function could be overridden later in the link stage, we can't
+  // propagate information about its results into callers.
+  if (F.hasLinkOnceLinkage() || F.hasWeakLinkage())
+    return false;
+  
   // Check to see if this function returns a constant.
   SmallVector<Value *,4> RetVals;
   const StructType *STy = dyn_cast<StructType>(F.getReturnType());
   if (STy)
-    RetVals.assign(STy->getNumElements(), 0);
+    for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) 
+      RetVals.push_back(UndefValue::get(STy->getElementType(i)));
   else
-    RetVals.push_back(0);
+    RetVals.push_back(UndefValue::get(F.getReturnType()));
 
+  unsigned NumNonConstant = 0;
   for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
     if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
-      unsigned RetValsSize = RetVals.size();
-      assert (RetValsSize == RI->getNumOperands() && "Invalid ReturnInst operands!");
-      for (unsigned i = 0; i < RetValsSize; ++i) {
-        if (isa<UndefValue>(RI->getOperand(i))) {
-          // Ignore
-        } else if (Constant *C = dyn_cast<Constant>(RI->getOperand(i))) { 
-          Value *RV = RetVals[i];
-          if (RV == 0)
-            RetVals[i] = C;
-          else if (RV != C)
-            return false; // Does not return the same constant.
-        } else {
-          return false; // Does not return a constant.
-        }
-      }
-    }
-
-  if (STy) {
-    for (unsigned i = 0, e = RetVals.size(); i < e; ++i) 
-      if (RetVals[i] == 0) 
-        RetVals[i] = UndefValue::get(STy->getElementType(i));
-  } else {
-    if (RetVals.size() == 1) 
-      if (RetVals[0] == 0)
-        RetVals[0] = UndefValue::get(F.getReturnType());
-  }
-
-  // If we got here, the function returns a constant value.  Loop over all
-  // users, replacing any uses of the return value with the returned constant.
-  bool ReplacedAllUsers = true;
-  bool MadeChange = false;
-  for (Value::use_iterator I = F.use_begin(), E = F.use_end(); I != E; ++I)
-    if (!isa<Instruction>(*I))
-      ReplacedAllUsers = false;
-    else {
-      CallSite CS = CallSite::get(cast<Instruction>(*I));
-      if (CS.getInstruction() == 0 ||
-          CS.getCalledFunction() != &F) {
-        ReplacedAllUsers = false;
-      } else {
-        Instruction *Call = CS.getInstruction();
-        if (!Call->use_empty()) {
-          if (RetVals.size() == 1)
-            Call->replaceAllUsesWith(RetVals[0]);
-          else {
-            for(Value::use_iterator CUI = Call->use_begin(), CUE = Call->use_end();
-                CUI != CUE; ++CUI) {
-              GetResultInst *GR = cast<GetResultInst>(CUI);
-              if (RetVals[GR->getIndex()]) {
-                GR->replaceAllUsesWith(RetVals[GR->getIndex()]);
-                GR->eraseFromParent();
-              }
+      // Return type does not match operand type, this is an old style multiple
+      // return
+      bool OldReturn = (F.getReturnType() != RI->getOperand(0)->getType());
+
+      for (unsigned i = 0, e = RetVals.size(); i != e; ++i) {
+        // Already found conflicting return values?
+        Value *RV = RetVals[i];
+        if (!RV)
+          continue;
+
+        // Find the returned value
+        Value *V;
+        if (!STy || OldReturn)
+          V = RI->getOperand(i);
+        else
+          V = FindInsertedValue(RI->getOperand(0), i);
+
+        if (V) {
+          // Ignore undefs, we can change them into anything
+          if (isa<UndefValue>(V))
+            continue;
+          
+          // Try to see if all the rets return the same constant or argument.
+          if (isa<Constant>(V) || isa<Argument>(V)) {
+            if (isa<UndefValue>(RV)) {
+              // No value found yet? Try the current one.
+              RetVals[i] = V;
+              continue;
             }
+            // Returning the same value? Good.
+            if (RV == V)
+              continue;
           }
-          MadeChange = true;
         }
+        // Different or no known return value? Don't propagate this return
+        // value.
+        RetVals[i] = 0;
+        // All values non constant? Stop looking.
+        if (++NumNonConstant == RetVals.size())
+          return false;
       }
     }
 
-  // If we replace all users with the returned constant, and there can be no
-  // other callers of the function, replace the constant being returned in the
-  // function with an undef value.
-  if (ReplacedAllUsers && F.hasInternalLinkage()) {
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
-        for (unsigned i = 0, e = RetVals.size(); i < e; ++i) {
-          Value *RetVal = RetVals[i];
-          if (isa<UndefValue>(RetVal))
-            continue;
-          Value *RV = UndefValue::get(RetVal->getType());
-          if (RI->getOperand(i) != RV) {
-            RI->setOperand(i, RV);
-            MadeChange = true;
-          }
+  // If we got here, the function returns at least one constant value.  Loop
+  // over all users, replacing any uses of the return value with the returned
+  // constant.
+  bool MadeChange = false;
+  for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) {
+    CallSite CS = CallSite::get(*UI);
+    Instruction* Call = CS.getInstruction();
+
+    // Not a call instruction or a call instruction that's not calling F
+    // directly?
+    if (!Call || UI.getOperandNo() != 0)
+      continue;
+    
+    // Call result not used?
+    if (Call->use_empty())
+      continue;
+
+    MadeChange = true;
+
+    if (STy == 0) {
+      Value* New = RetVals[0];
+      if (Argument *A = dyn_cast<Argument>(New))
+        // Was an argument returned? Then find the corresponding argument in
+        // the call instruction and use that.
+        New = CS.getArgument(A->getArgNo());
+      Call->replaceAllUsesWith(New);
+      continue;
+    }
+   
+    for (Value::use_iterator I = Call->use_begin(), E = Call->use_end();
+         I != E;) {
+      Instruction *Ins = dyn_cast<Instruction>(*I);
+
+      // Increment now, so we can remove the use
+      ++I;
+
+      // Not an instruction? Ignore
+      if (!Ins)
+        continue;
+
+      // Find the index of the retval to replace with
+      int index = -1;
+      if (GetResultInst *GR = dyn_cast<GetResultInst>(Ins))
+        index = GR->getIndex();
+      else if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins))
+        if (EV->hasIndices())
+          index = *EV->idx_begin();
+
+      // If this use uses a specific return value, and we have a replacement,
+      // replace it.
+      if (index != -1) {
+        Value *New = RetVals[index];
+        if (New) {
+          if (Argument *A = dyn_cast<Argument>(New))
+            // Was an argument returned? Then find the corresponding argument in
+            // the call instruction and use that.
+            New = CS.getArgument(A->getArgNo());
+          Ins->replaceAllUsesWith(New);
+          Ins->eraseFromParent();
         }
       }
     }

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/IndMemRemoval.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/IndMemRemoval.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/IndMemRemoval.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/IndMemRemoval.cpp Sun Jul  6 15:45:41 2008
@@ -37,10 +37,11 @@
 
     virtual bool runOnModule(Module &M);
   };
-  char IndMemRemPass::ID = 0;
-  RegisterPass<IndMemRemPass> X("indmemrem","Indirect Malloc and Free Removal");
 } // end anonymous namespace
 
+char IndMemRemPass::ID = 0;
+static RegisterPass<IndMemRemPass>
+X("indmemrem","Indirect Malloc and Free Removal");
 
 bool IndMemRemPass::runOnModule(Module &M) {
   //in Theory, all direct calls of malloc and free should be promoted
@@ -71,7 +72,7 @@
                                       GlobalValue::LinkOnceLinkage, 
                                       "malloc_llvm_bounce", &M);
       BasicBlock* bb = BasicBlock::Create("entry",FN);
-      Instruction* c = CastInst::createIntegerCast(
+      Instruction* c = CastInst::CreateIntegerCast(
           FN->arg_begin(), Type::Int32Ty, false, "c", bb);
       Instruction* a = new MallocInst(Type::Int8Ty, c, "m", bb);
       ReturnInst::Create(a, bb);

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/InlineSimple.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/InlineSimple.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/InlineSimple.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/InlineSimple.cpp Sun Jul  6 15:45:41 2008
@@ -45,10 +45,12 @@
     }
     virtual bool doInitialization(CallGraph &CG);
   };
-  char SimpleInliner::ID = 0;
-  RegisterPass<SimpleInliner> X("inline", "Function Integration/Inlining");
 }
 
+char SimpleInliner::ID = 0;
+static RegisterPass<SimpleInliner>
+X("inline", "Function Integration/Inlining");
+
 Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
 
 Pass *llvm::createFunctionInliningPass(int Threshold) { 

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/Inliner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/Inliner.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/Inliner.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/Inliner.cpp Sun Jul  6 15:45:41 2008
@@ -30,11 +30,9 @@
 STATISTIC(NumInlined, "Number of functions inlined");
 STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
 
-namespace {
-  cl::opt<int>
-  InlineLimit("inline-threshold", cl::Hidden, cl::init(200),
+static cl::opt<int>
+InlineLimit("inline-threshold", cl::Hidden, cl::init(200),
         cl::desc("Control the amount of inlining to perform (default = 200)"));
-}
 
 Inliner::Inliner(const void *ID) 
   : CallGraphSCCPass((intptr_t)ID), InlineThreshold(InlineLimit) {}

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/Internalize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/Internalize.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/Internalize.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/Internalize.cpp Sun Jul  6 15:45:41 2008
@@ -28,47 +28,47 @@
 STATISTIC(NumFunctions, "Number of functions internalized");
 STATISTIC(NumGlobals  , "Number of global vars internalized");
 
-namespace {
-
-  // APIFile - A file which contains a list of symbols that should not be marked
-  // external.
-  cl::opt<std::string>
-  APIFile("internalize-public-api-file", cl::value_desc("filename"),
-          cl::desc("A file containing list of symbol names to preserve"));
-
-  // APIList - A list of symbols that should not be marked internal.
-  cl::list<std::string>
-  APIList("internalize-public-api-list", cl::value_desc("list"),
-          cl::desc("A list of symbol names to preserve"),
-          cl::CommaSeparated);
+// APIFile - A file which contains a list of symbols that should not be marked
+// external.
+static cl::opt<std::string>
+APIFile("internalize-public-api-file", cl::value_desc("filename"),
+        cl::desc("A file containing list of symbol names to preserve"));
+
+// APIList - A list of symbols that should not be marked internal.
+static cl::list<std::string>
+APIList("internalize-public-api-list", cl::value_desc("list"),
+        cl::desc("A list of symbol names to preserve"),
+        cl::CommaSeparated);
 
+namespace {
   class VISIBILITY_HIDDEN InternalizePass : public ModulePass {
     std::set<std::string> ExternalNames;
-    bool DontInternalize;
+    /// If no api symbols were specified and a main function is defined,
+    /// assume the main function is the only API
+    bool AllButMain;
   public:
     static char ID; // Pass identification, replacement for typeid
-    explicit InternalizePass(bool InternalizeEverything = true);
+    explicit InternalizePass(bool AllButMain = true);
     explicit InternalizePass(const std::vector <const char *>& exportList);
     void LoadFile(const char *Filename);
     virtual bool runOnModule(Module &M);
   };
-  char InternalizePass::ID = 0;
-  RegisterPass<InternalizePass> X("internalize", "Internalize Global Symbols");
 } // end anonymous namespace
 
-InternalizePass::InternalizePass(bool InternalizeEverything) 
-  : ModulePass((intptr_t)&ID), DontInternalize(false){
-  if (!APIFile.empty())           // If a filename is specified, use it
+char InternalizePass::ID = 0;
+static RegisterPass<InternalizePass>
+X("internalize", "Internalize Global Symbols");
+
+InternalizePass::InternalizePass(bool AllButMain)
+  : ModulePass((intptr_t)&ID), AllButMain(AllButMain){
+  if (!APIFile.empty())           // If a filename is specified, use it.
     LoadFile(APIFile.c_str());
-  else if (!APIList.empty())      // Else, if a list is specified, use it.
+  if (!APIList.empty())           // If a list is specified, use it as well.
     ExternalNames.insert(APIList.begin(), APIList.end());
-  else if (!InternalizeEverything)
-    // Finally, if we're allowed to, internalize all but main.
-    DontInternalize = true;
 }
 
 InternalizePass::InternalizePass(const std::vector<const char *>&exportList) 
-  : ModulePass((intptr_t)&ID), DontInternalize(false){
+  : ModulePass((intptr_t)&ID), AllButMain(false){
   for(std::vector<const char *>::const_iterator itr = exportList.begin();
         itr != exportList.end(); itr++) {
     ExternalNames.insert(*itr);
@@ -79,8 +79,9 @@
   // Load the APIFile...
   std::ifstream In(Filename);
   if (!In.good()) {
-    cerr << "WARNING: Internalize couldn't load file '" << Filename << "'!\n";
-    return;   // Do not internalize anything...
+    cerr << "WARNING: Internalize couldn't load file '" << Filename 
+         << "'! Continuing as if it's empty.\n";
+    return; // Just continue as if the file were empty
   }
   while (In) {
     std::string Symbol;
@@ -91,13 +92,14 @@
 }
 
 bool InternalizePass::runOnModule(Module &M) {
-  if (DontInternalize) return false;
-  
-  // If no list or file of symbols was specified, check to see if there is a
-  // "main" symbol defined in the module.  If so, use it, otherwise do not
-  // internalize the module, it must be a library or something.
-  //
   if (ExternalNames.empty()) {
+    // Return if we're not in 'all but main' mode and have no external api
+    if (!AllButMain)
+      return false; 
+    // If no list or file of symbols was specified, check to see if there is a
+    // "main" symbol defined in the module.  If so, use it, otherwise do not
+    // internalize the module, it must be a library or something.
+    //
     Function *MainFunc = M.getFunction("main");
     if (MainFunc == 0 || MainFunc->isDeclaration())
       return false;  // No main found, must be a library...
@@ -108,7 +110,7 @@
   
   bool Changed = false;
   
-  // Found a main function, mark all functions not named main as internal.
+  // Mark all functions not in the api as internal.
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
     if (!I->isDeclaration() &&         // Function must be defined here
         !I->hasInternalLinkage() &&  // Can't already have internal linkage
@@ -133,7 +135,8 @@
   ExternalNames.insert("llvm.noinline");
   ExternalNames.insert("llvm.global.annotations");
       
-  // Mark all global variables with initializers as internal as well.
+  // Mark all global variables with initializers that are not in the api as
+  // internal as well.
   for (Module::global_iterator I = M.global_begin(), E = M.global_end();
        I != E; ++I)
     if (!I->isDeclaration() && !I->hasInternalLinkage() &&
@@ -147,8 +150,8 @@
   return Changed;
 }
 
-ModulePass *llvm::createInternalizePass(bool InternalizeEverything) {
-  return new InternalizePass(InternalizeEverything);
+ModulePass *llvm::createInternalizePass(bool AllButMain) {
+  return new InternalizePass(AllButMain);
 }
 
 ModulePass *llvm::createInternalizePass(const std::vector <const char *> &el) {

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/LoopExtractor.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/LoopExtractor.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/LoopExtractor.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/LoopExtractor.cpp Sun Jul  6 15:45:41 2008
@@ -52,22 +52,24 @@
       AU.addRequired<LoopInfo>();
     }
   };
+}
 
-  char LoopExtractor::ID = 0;
-  RegisterPass<LoopExtractor>
-  X("loop-extract", "Extract loops into new functions");
+char LoopExtractor::ID = 0;
+static RegisterPass<LoopExtractor>
+X("loop-extract", "Extract loops into new functions");
 
+namespace {
   /// SingleLoopExtractor - For bugpoint.
   struct SingleLoopExtractor : public LoopExtractor {
     static char ID; // Pass identification, replacement for typeid
     SingleLoopExtractor() : LoopExtractor(1) {}
   };
-
-  char SingleLoopExtractor::ID = 0;
-  RegisterPass<SingleLoopExtractor>
-  Y("loop-extract-single", "Extract at most one loop into a new function");
 } // End anonymous namespace
 
+char SingleLoopExtractor::ID = 0;
+static RegisterPass<SingleLoopExtractor>
+Y("loop-extract-single", "Extract at most one loop into a new function");
+
 // createLoopExtractorPass - This pass extracts all natural loops from the
 // program into a function if it can.
 //
@@ -146,14 +148,14 @@
 }
 
 
-namespace {
-  // BlockFile - A file which contains a list of blocks that should not be
-  // extracted.
-  cl::opt<std::string>
-  BlockFile("extract-blocks-file", cl::value_desc("filename"),
-            cl::desc("A file containing list of basic blocks to not extract"),
-            cl::Hidden);
+// BlockFile - A file which contains a list of blocks that should not be
+// extracted.
+static cl::opt<std::string>
+BlockFile("extract-blocks-file", cl::value_desc("filename"),
+          cl::desc("A file containing list of basic blocks to not extract"),
+          cl::Hidden);
 
+namespace {
   /// BlockExtractorPass - This pass is used by bugpoint to extract all blocks
   /// from the module into their own functions except for those specified by the
   /// BlocksToNotExtract list.
@@ -173,12 +175,12 @@
 
     bool runOnModule(Module &M);
   };
-
-  char BlockExtractorPass::ID = 0;
-  RegisterPass<BlockExtractorPass>
-  XX("extract-blocks", "Extract Basic Blocks From Module (for bugpoint use)");
 }
 
+char BlockExtractorPass::ID = 0;
+static RegisterPass<BlockExtractorPass>
+XX("extract-blocks", "Extract Basic Blocks From Module (for bugpoint use)");
+
 // createBlockExtractorPass - This pass extracts all blocks (except those
 // specified in the argument list) from the functions in the module.
 //

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/LowerSetJmp.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/LowerSetJmp.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/LowerSetJmp.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/LowerSetJmp.cpp Sun Jul  6 15:45:41 2008
@@ -122,11 +122,11 @@
     bool runOnModule(Module& M);
     bool doInitialization(Module& M);
   };
-
-  char LowerSetJmp::ID = 0;
-  RegisterPass<LowerSetJmp> X("lowersetjmp", "Lower Set Jump");
 } // end anonymous namespace
 
+char LowerSetJmp::ID = 0;
+static RegisterPass<LowerSetJmp> X("lowersetjmp", "Lower Set Jump");
+
 // run - Run the transformation on the program. We grab the function
 // prototypes for longjmp and setjmp. If they are used in the program,
 // then we can go directly to the places they're at and transform them.
@@ -341,29 +341,25 @@
   if (SwitchValMap[Func].first) return SwitchValMap[Func];
 
   BasicBlock* LongJmpPre = BasicBlock::Create("LongJmpBlkPre", Func);
-  BasicBlock::InstListType& LongJmpPreIL = LongJmpPre->getInstList();
 
   // Keep track of the preliminary basic block for some of the other
   // transformations.
   PrelimBBMap[Func] = LongJmpPre;
 
   // Grab the exception.
-  CallInst* Cond = CallInst::Create(IsLJException, "IsLJExcept");
-  LongJmpPreIL.push_back(Cond);
+  CallInst* Cond = CallInst::Create(IsLJException, "IsLJExcept", LongJmpPre);
 
   // The "decision basic block" gets the number associated with the
   // setjmp call returning to switch on and the value returned by
   // longjmp.
   BasicBlock* DecisionBB = BasicBlock::Create("LJDecisionBB", Func);
-  BasicBlock::InstListType& DecisionBBIL = DecisionBB->getInstList();
 
   BranchInst::Create(DecisionBB, Rethrow, Cond, LongJmpPre);
 
   // Fill in the "decision" basic block.
-  CallInst* LJVal = CallInst::Create(GetLJValue, "LJVal");
-  DecisionBBIL.push_back(LJVal);
-  CallInst* SJNum = CallInst::Create(TryCatchLJ, GetSetJmpMap(Func), "SJNum");
-  DecisionBBIL.push_back(SJNum);
+  CallInst* LJVal = CallInst::Create(GetLJValue, "LJVal", DecisionBB);
+  CallInst* SJNum = CallInst::Create(TryCatchLJ, GetSetJmpMap(Func), "SJNum",
+                                     DecisionBB);
 
   SwitchInst* SI = SwitchInst::Create(SJNum, Rethrow, 0, DecisionBB);
   return SwitchValMap[Func] = SwitchValuePair(SI, LJVal);
@@ -444,7 +440,7 @@
   // Replace all uses of this instruction with the PHI node created by
   // the eradication of setjmp.
   Inst->replaceAllUsesWith(PHI);
-  Inst->getParent()->getInstList().erase(Inst);
+  Inst->eraseFromParent();
 
   ++SetJmpsTransformed;
 }
@@ -482,10 +478,10 @@
 
   // Replace the old call inst with the invoke inst and remove the call.
   CI.replaceAllUsesWith(II);
-  CI.getParent()->getInstList().erase(&CI);
+  CI.eraseFromParent();
 
   // The old terminator is useless now that we have the invoke inst.
-  Term->getParent()->getInstList().erase(Term);
+  Term->eraseFromParent();
   ++CallsTransformed;
 }
 
@@ -508,12 +504,11 @@
 
   Function* Func = BB->getParent();
   BasicBlock* NewExceptBB = BasicBlock::Create("InvokeExcept", Func);
-  BasicBlock::InstListType& InstList = NewExceptBB->getInstList();
 
   // If this is a longjmp exception, then branch to the preliminary BB of
   // the longjmp exception handling. Otherwise, go to the old exception.
-  CallInst* IsLJExcept = CallInst::Create(IsLJException, "IsLJExcept");
-  InstList.push_back(IsLJExcept);
+  CallInst* IsLJExcept = CallInst::Create(IsLJException, "IsLJExcept",
+                                          NewExceptBB);
 
   BranchInst::Create(PrelimBBMap[Func], ExceptBB, IsLJExcept, NewExceptBB);
 

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/PruneEH.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/PruneEH.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/PruneEH.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/PruneEH.cpp Sun Jul  6 15:45:41 2008
@@ -31,7 +31,6 @@
 
 STATISTIC(NumRemoved, "Number of invokes removed");
 STATISTIC(NumUnreach, "Number of noreturn calls optimized");
-STATISTIC(NumBBUnwind, "Number of unwind dest removed from blocks");
 
 namespace {
   struct VISIBILITY_HIDDEN PruneEH : public CallGraphSCCPass {
@@ -44,11 +43,12 @@
     bool SimplifyFunction(Function *F);
     void DeleteBasicBlock(BasicBlock *BB);
   };
-
-  char PruneEH::ID = 0;
-  RegisterPass<PruneEH> X("prune-eh", "Remove unused exception handling info");
 }
 
+char PruneEH::ID = 0;
+static RegisterPass<PruneEH>
+X("prune-eh", "Remove unused exception handling info");
+
 Pass *llvm::createPruneEHPass() { return new PruneEH(); }
 
 
@@ -64,6 +64,8 @@
 
   // Next, check to see if any callees might throw or if there are any external
   // functions in this SCC: if so, we cannot prune any functions in this SCC.
+  // Definitions that are weak and not declared non-throwing might be 
+  // overridden at linktime with something that throws, so assume that.
   // If this SCC includes the unwind instruction, we KNOW it throws, so
   // obviously the SCC might throw.
   //
@@ -74,7 +76,7 @@
     if (F == 0) {
       SCCMightUnwind = true;
       SCCMightReturn = true;
-    } else if (F->isDeclaration()) {
+    } else if (F->isDeclaration() || F->hasWeakLinkage()) {
       SCCMightUnwind |= !F->doesNotThrow();
       SCCMightReturn |= !F->doesNotReturn();
     } else {
@@ -152,8 +154,6 @@
 bool PruneEH::SimplifyFunction(Function *F) {
   bool MadeChange = false;
   for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-    bool couldUnwind = false;
-
     if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
       if (II->doesNotThrow()) {
         SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
@@ -183,12 +183,10 @@
 
         ++NumRemoved;
         MadeChange = true;
-      } else {
-        couldUnwind = true;
       }
 
     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
-      if (CallInst *CI = dyn_cast<CallInst>(I++)) {
+      if (CallInst *CI = dyn_cast<CallInst>(I++))
         if (CI->doesNotReturn() && !isa<UnreachableInst>(I)) {
           // This call calls a function that cannot return.  Insert an
           // unreachable instruction after it and simplify the code.  Do this
@@ -204,19 +202,9 @@
           MadeChange = true;
           ++NumUnreach;
           break;
-        } else if (!CI->doesNotThrow()) {
-          couldUnwind = true;
         }
-      }
-
-    // Strip 'unwindTo' off of BBs that have no calls/invokes without nounwind.
-    if (!couldUnwind && BB->getUnwindDest()) {
-      MadeChange = true;
-      ++NumBBUnwind;
-      BB->getUnwindDest()->removePredecessor(BB, false, true);
-      BB->setUnwindDest(NULL);
-    }
   }
+
   return MadeChange;
 }
 

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/RaiseAllocations.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/RaiseAllocations.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/RaiseAllocations.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/RaiseAllocations.cpp Sun Jul  6 15:45:41 2008
@@ -48,12 +48,11 @@
     //
     bool runOnModule(Module &M);
   };
-
-  char RaiseAllocations::ID = 0;
-  RegisterPass<RaiseAllocations>
-  X("raiseallocs", "Raise allocations from calls to instructions");
 }  // end anonymous namespace
 
+char RaiseAllocations::ID = 0;
+static RegisterPass<RaiseAllocations>
+X("raiseallocs", "Raise allocations from calls to instructions");
 
 // createRaiseAllocationsPass - The interface to this file...
 ModulePass *llvm::createRaiseAllocationsPass() {
@@ -165,7 +164,7 @@
           // source size.
           if (Source->getType() != Type::Int32Ty)
             Source = 
-              CastInst::createIntegerCast(Source, Type::Int32Ty, false/*ZExt*/,
+              CastInst::CreateIntegerCast(Source, Type::Int32Ty, false/*ZExt*/,
                                           "MallocAmtCast", I);
 
           MallocInst *MI = new MallocInst(Type::Int8Ty, Source, "", I);
@@ -178,7 +177,7 @@
             BranchInst::Create(II->getNormalDest(), I);
 
           // Delete the old call site
-          MI->getParent()->getInstList().erase(I);
+          I->eraseFromParent();
           Changed = true;
           ++NumRaised;
         }

Removed: llvm/branches/non-call-eh/lib/Transforms/IPO/SimplifyLibCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/SimplifyLibCalls.cpp?rev=53162&view=auto

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/SimplifyLibCalls.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/SimplifyLibCalls.cpp (removed)
@@ -1,2085 +0,0 @@
-//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a module pass that applies a variety of small
-// optimizations for calls to specific well-known function calls (e.g. runtime
-// library functions). For example, a call to the function "exit(3)" that
-// occurs within the main() function can be transformed into a simple "return 3"
-// instruction. Any optimization that takes this form (replace call to library
-// function with simpler code that provides the same result) belongs in this
-// file.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "simplify-libcalls"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Config/config.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Transforms/IPO.h"
-#include <cstring>
-using namespace llvm;
-
-/// This statistic keeps track of the total number of library calls that have
-/// been simplified regardless of which call it is.
-STATISTIC(SimplifiedLibCalls, "Number of library calls simplified");
-
-namespace {
-  // Forward declarations
-  class LibCallOptimization;
-  class SimplifyLibCalls;
-  
-/// This list is populated by the constructor for LibCallOptimization class.
-/// Therefore all subclasses are registered here at static initialization time
-/// and this list is what the SimplifyLibCalls pass uses to apply the individual
-/// optimizations to the call sites.
-/// @brief The list of optimizations deriving from LibCallOptimization
-static LibCallOptimization *OptList = 0;
-
-/// This class is the abstract base class for the set of optimizations that
-/// corresponds to one library call. The SimplifyLibCalls pass will call the
-/// ValidateCalledFunction method to ask the optimization if a given Function
-/// is the kind that the optimization can handle. If the subclass returns true,
-/// then SImplifyLibCalls will also call the OptimizeCall method to perform,
-/// or attempt to perform, the optimization(s) for the library call. Otherwise,
-/// OptimizeCall won't be called. Subclasses are responsible for providing the
-/// name of the library call (strlen, strcpy, etc.) to the LibCallOptimization
-/// constructor. This is used to efficiently select which call instructions to
-/// optimize. The criteria for a "lib call" is "anything with well known
-/// semantics", typically a library function that is defined by an international
-/// standard. Because the semantics are well known, the optimizations can
-/// generally short-circuit actually calling the function if there's a simpler
-/// way (e.g. strlen(X) can be reduced to a constant if X is a constant global).
-/// @brief Base class for library call optimizations
-class VISIBILITY_HIDDEN LibCallOptimization {
-  LibCallOptimization **Prev, *Next;
-  const char *FunctionName; ///< Name of the library call we optimize
-#ifndef NDEBUG
-  Statistic occurrences; ///< debug statistic (-debug-only=simplify-libcalls)
-#endif
-public:
-  /// The \p fname argument must be the name of the library function being
-  /// optimized by the subclass.
-  /// @brief Constructor that registers the optimization.
-  LibCallOptimization(const char *FName, const char *Description)
-    : FunctionName(FName) {
-      
-#ifndef NDEBUG
-    occurrences.construct("simplify-libcalls", Description);
-#endif
-    // Register this optimizer in the list of optimizations.
-    Next = OptList;
-    OptList = this;
-    Prev = &OptList;
-    if (Next) Next->Prev = &Next;
-  }
-  
-  /// getNext - All libcall optimizations are chained together into a list,
-  /// return the next one in the list.
-  LibCallOptimization *getNext() { return Next; }
-
-  /// @brief Deregister from the optlist
-  virtual ~LibCallOptimization() {
-    *Prev = Next;
-    if (Next) Next->Prev = Prev;
-  }
-
-  /// The implementation of this function in subclasses should determine if
-  /// \p F is suitable for the optimization. This method is called by
-  /// SimplifyLibCalls::runOnModule to short circuit visiting all the call
-  /// sites of such a function if that function is not suitable in the first
-  /// place.  If the called function is suitabe, this method should return true;
-  /// false, otherwise. This function should also perform any lazy
-  /// initialization that the LibCallOptimization needs to do, if its to return
-  /// true. This avoids doing initialization until the optimizer is actually
-  /// going to be called upon to do some optimization.
-  /// @brief Determine if the function is suitable for optimization
-  virtual bool ValidateCalledFunction(
-    const Function* F,    ///< The function that is the target of call sites
-    SimplifyLibCalls& SLC ///< The pass object invoking us
-  ) = 0;
-
-  /// The implementations of this function in subclasses is the heart of the
-  /// SimplifyLibCalls algorithm. Sublcasses of this class implement
-  /// OptimizeCall to determine if (a) the conditions are right for optimizing
-  /// the call and (b) to perform the optimization. If an action is taken
-  /// against ci, the subclass is responsible for returning true and ensuring
-  /// that ci is erased from its parent.
-  /// @brief Optimize a call, if possible.
-  virtual bool OptimizeCall(
-    CallInst* ci,          ///< The call instruction that should be optimized.
-    SimplifyLibCalls& SLC  ///< The pass object invoking us
-  ) = 0;
-
-  /// @brief Get the name of the library call being optimized
-  const char *getFunctionName() const { return FunctionName; }
-
-  bool ReplaceCallWith(CallInst *CI, Value *V) {
-    if (!CI->use_empty())
-      CI->replaceAllUsesWith(V);
-    CI->eraseFromParent();
-    return true;
-  }
-  
-  /// @brief Called by SimplifyLibCalls to update the occurrences statistic.
-  void succeeded() {
-#ifndef NDEBUG
-    DEBUG(++occurrences);
-#endif
-  }
-};
-
-/// This class is an LLVM Pass that applies each of the LibCallOptimization
-/// instances to all the call sites in a module, relatively efficiently. The
-/// purpose of this pass is to provide optimizations for calls to well-known
-/// functions with well-known semantics, such as those in the c library. The
-/// class provides the basic infrastructure for handling runOnModule.  Whenever
-/// this pass finds a function call, it asks the appropriate optimizer to
-/// validate the call (ValidateLibraryCall). If it is validated, then
-/// the OptimizeCall method is also called.
-/// @brief A ModulePass for optimizing well-known function calls.
-class VISIBILITY_HIDDEN SimplifyLibCalls : public ModulePass {
-public:
-  static char ID; // Pass identification, replacement for typeid
-  SimplifyLibCalls() : ModulePass((intptr_t)&ID) {}
-
-  /// We need some target data for accurate signature details that are
-  /// target dependent. So we require target data in our AnalysisUsage.
-  /// @brief Require TargetData from AnalysisUsage.
-  virtual void getAnalysisUsage(AnalysisUsage& Info) const {
-    // Ask that the TargetData analysis be performed before us so we can use
-    // the target data.
-    Info.addRequired<TargetData>();
-  }
-
-  /// For this pass, process all of the function calls in the module, calling
-  /// ValidateLibraryCall and OptimizeCall as appropriate.
-  /// @brief Run all the lib call optimizations on a Module.
-  virtual bool runOnModule(Module &M) {
-    reset(M);
-
-    bool result = false;
-    StringMap<LibCallOptimization*> OptznMap;
-    for (LibCallOptimization *Optzn = OptList; Optzn; Optzn = Optzn->getNext())
-      OptznMap[Optzn->getFunctionName()] = Optzn;
-
-    // The call optimizations can be recursive. That is, the optimization might
-    // generate a call to another function which can also be optimized. This way
-    // we make the LibCallOptimization instances very specific to the case they
-    // handle. It also means we need to keep running over the function calls in
-    // the module until we don't get any more optimizations possible.
-    bool found_optimization = false;
-    do {
-      found_optimization = false;
-      for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
-        // All the "well-known" functions are external and have external linkage
-        // because they live in a runtime library somewhere and were (probably)
-        // not compiled by LLVM.  So, we only act on external functions that
-        // have external or dllimport linkage and non-empty uses.
-        if (!FI->isDeclaration() ||
-            !(FI->hasExternalLinkage() || FI->hasDLLImportLinkage()) ||
-            FI->use_empty())
-          continue;
-
-        // Get the optimization class that pertains to this function
-        StringMap<LibCallOptimization*>::iterator OMI =
-          OptznMap.find(FI->getName());
-        if (OMI == OptznMap.end()) continue;
-        
-        LibCallOptimization *CO = OMI->second;
-
-        // Make sure the called function is suitable for the optimization
-        if (!CO->ValidateCalledFunction(FI, *this))
-          continue;
-
-        // Loop over each of the uses of the function
-        for (Value::use_iterator UI = FI->use_begin(), UE = FI->use_end();
-             UI != UE ; ) {
-          // If the use of the function is a call instruction
-          if (CallInst* CI = dyn_cast<CallInst>(*UI++)) {
-            // Do the optimization on the LibCallOptimization.
-            if (CO->OptimizeCall(CI, *this)) {
-              ++SimplifiedLibCalls;
-              found_optimization = result = true;
-              CO->succeeded();
-            }
-          }
-        }
-      }
-    } while (found_optimization);
-    
-    return result;
-  }
-
-  /// @brief Return the *current* module we're working on.
-  Module* getModule() const { return M; }
-
-  /// @brief Return the *current* target data for the module we're working on.
-  TargetData* getTargetData() const { return TD; }
-
-  /// @brief Return the size_t type -- syntactic shortcut
-  const Type* getIntPtrType() const { return TD->getIntPtrType(); }
-
-  /// @brief Return a Function* for the putchar libcall
-  Constant *get_putchar() {
-    if (!putchar_func)
-      putchar_func = 
-        M->getOrInsertFunction("putchar", Type::Int32Ty, Type::Int32Ty, NULL);
-    return putchar_func;
-  }
-
-  /// @brief Return a Function* for the puts libcall
-  Constant *get_puts() {
-    if (!puts_func)
-      puts_func = M->getOrInsertFunction("puts", Type::Int32Ty,
-                                         PointerType::getUnqual(Type::Int8Ty),
-                                         NULL);
-    return puts_func;
-  }
-
-  /// @brief Return a Function* for the fputc libcall
-  Constant *get_fputc(const Type* FILEptr_type) {
-    if (!fputc_func)
-      fputc_func = M->getOrInsertFunction("fputc", Type::Int32Ty, Type::Int32Ty,
-                                          FILEptr_type, NULL);
-    return fputc_func;
-  }
-
-  /// @brief Return a Function* for the fputs libcall
-  Constant *get_fputs(const Type* FILEptr_type) {
-    if (!fputs_func)
-      fputs_func = M->getOrInsertFunction("fputs", Type::Int32Ty,
-                                          PointerType::getUnqual(Type::Int8Ty),
-                                          FILEptr_type, NULL);
-    return fputs_func;
-  }
-
-  /// @brief Return a Function* for the fwrite libcall
-  Constant *get_fwrite(const Type* FILEptr_type) {
-    if (!fwrite_func)
-      fwrite_func = M->getOrInsertFunction("fwrite", TD->getIntPtrType(),
-                                           PointerType::getUnqual(Type::Int8Ty),
-                                           TD->getIntPtrType(),
-                                           TD->getIntPtrType(),
-                                           FILEptr_type, NULL);
-    return fwrite_func;
-  }
-
-  /// @brief Return a Function* for the sqrt libcall
-  Constant *get_sqrt() {
-    if (!sqrt_func)
-      sqrt_func = M->getOrInsertFunction("sqrt", Type::DoubleTy, 
-                                         Type::DoubleTy, NULL);
-    return sqrt_func;
-  }
-
-  /// @brief Return a Function* for the strcpy libcall
-  Constant *get_strcpy() {
-    if (!strcpy_func)
-      strcpy_func = M->getOrInsertFunction("strcpy",
-                                           PointerType::getUnqual(Type::Int8Ty),
-                                           PointerType::getUnqual(Type::Int8Ty),
-                                           PointerType::getUnqual(Type::Int8Ty),
-                                           NULL);
-    return strcpy_func;
-  }
-
-  /// @brief Return a Function* for the strlen libcall
-  Constant *get_strlen() {
-    if (!strlen_func)
-      strlen_func = M->getOrInsertFunction("strlen", TD->getIntPtrType(),
-                                           PointerType::getUnqual(Type::Int8Ty),
-                                           NULL);
-    return strlen_func;
-  }
-
-  /// @brief Return a Function* for the memchr libcall
-  Constant *get_memchr() {
-    if (!memchr_func)
-      memchr_func = M->getOrInsertFunction("memchr",
-                                           PointerType::getUnqual(Type::Int8Ty),
-                                           PointerType::getUnqual(Type::Int8Ty),
-                                           Type::Int32Ty, TD->getIntPtrType(),
-                                           NULL);
-    return memchr_func;
-  }
-
-  /// @brief Return a Function* for the memcpy libcall
-  Constant *get_memcpy() {
-    if (!memcpy_func) {
-      Intrinsic::ID IID = (TD->getIntPtrType() == Type::Int32Ty) ?
-        Intrinsic::memcpy_i32 : Intrinsic::memcpy_i64;
-      memcpy_func = Intrinsic::getDeclaration(M, IID);
-    }
-    return memcpy_func;
-  }
-
-  Constant *getUnaryFloatFunction(const char *BaseName, const Type *T = 0) {
-    if (T == 0) T = Type::FloatTy;
-    
-    char NameBuffer[20];
-    const char *Name;
-    if (T == Type::DoubleTy)
-      Name = BaseName;              // floor
-    else {
-      Name = NameBuffer;
-      unsigned NameLen = strlen(BaseName);
-      assert(NameLen < sizeof(NameBuffer)-2 && "Buffer too small");
-      memcpy(NameBuffer, BaseName, NameLen);
-      if (T == Type::FloatTy)
-        NameBuffer[NameLen] = 'f';  // floorf
-      else
-        NameBuffer[NameLen] = 'l';  // floorl
-      NameBuffer[NameLen+1] = 0;
-    }
-    
-    return M->getOrInsertFunction(Name, T, T, NULL);
-  }
-  
-  Constant *get_floorf() { return getUnaryFloatFunction("floor"); }
-  Constant *get_ceilf()  { return getUnaryFloatFunction( "ceil"); }
-  Constant *get_roundf() { return getUnaryFloatFunction("round"); }
-  Constant *get_rintf()  { return getUnaryFloatFunction( "rint"); }
-  Constant *get_nearbyintf() { return getUnaryFloatFunction("nearbyint"); }
-
-  
-  
-private:
-  /// @brief Reset our cached data for a new Module
-  void reset(Module& mod) {
-    M = &mod;
-    TD = &getAnalysis<TargetData>();
-    putchar_func = 0;
-    puts_func = 0;
-    fputc_func = 0;
-    fputs_func = 0;
-    fwrite_func = 0;
-    memcpy_func = 0;
-    memchr_func = 0;
-    sqrt_func   = 0;
-    strcpy_func = 0;
-    strlen_func = 0;
-  }
-
-private:
-  /// Caches for function pointers.
-  Constant *putchar_func, *puts_func;
-  Constant *fputc_func, *fputs_func, *fwrite_func;
-  Constant *memcpy_func, *memchr_func;
-  Constant *sqrt_func;
-  Constant *strcpy_func, *strlen_func;
-  Module *M;             ///< Cached Module
-  TargetData *TD;        ///< Cached TargetData
-};
-
-char SimplifyLibCalls::ID = 0;
-// Register the pass
-RegisterPass<SimplifyLibCalls>
-X("simplify-libcalls", "Simplify well-known library calls");
-
-} // anonymous namespace
-
-// The only public symbol in this file which just instantiates the pass object
-ModulePass *llvm::createSimplifyLibCallsPass() {
-  return new SimplifyLibCalls();
-}
-
-// Classes below here, in the anonymous namespace, are all subclasses of the
-// LibCallOptimization class, each implementing all optimizations possible for a
-// single well-known library call. Each has a static singleton instance that
-// auto registers it into the "optlist" global above.
-namespace {
-
-// Forward declare utility functions.
-static bool GetConstantStringInfo(Value *V, std::string &Str);
-static Value *CastToCStr(Value *V, Instruction *IP);
-
-/// This LibCallOptimization will find instances of a call to "exit" that occurs
-/// within the "main" function and change it to a simple "ret" instruction with
-/// the same value passed to the exit function. When this is done, it splits the
-/// basic block at the exit(3) call and deletes the call instruction.
-/// @brief Replace calls to exit in main with a simple return
-struct VISIBILITY_HIDDEN ExitInMainOptimization : public LibCallOptimization {
-  ExitInMainOptimization() : LibCallOptimization("exit",
-      "Number of 'exit' calls simplified") {}
-
-  // Make sure the called function looks like exit (int argument, int return
-  // type, external linkage, not varargs).
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    return F->arg_size() >= 1 && F->arg_begin()->getType()->isInteger();
-  }
-
-  virtual bool OptimizeCall(CallInst* ci, SimplifyLibCalls& SLC) {
-    // To be careful, we check that the call to exit is coming from "main", that
-    // main has external linkage, and the return type of main and the argument
-    // to exit have the same type.
-    Function *from = ci->getParent()->getParent();
-    if (from->hasExternalLinkage())
-      if (from->getReturnType() == ci->getOperand(1)->getType()
-          && !isa<StructType>(from->getReturnType()))
-        if (from->getName() == "main") {
-          // Okay, time to actually do the optimization. First, get the basic
-          // block of the call instruction
-          BasicBlock* bb = ci->getParent();
-
-          // Create a return instruction that we'll replace the call with.
-          // Note that the argument of the return is the argument of the call
-          // instruction.
-          ReturnInst::Create(ci->getOperand(1), ci);
-
-          // Split the block at the call instruction which places it in a new
-          // basic block.
-          bb->splitBasicBlock(ci);
-
-          // The block split caused a branch instruction to be inserted into
-          // the end of the original block, right after the return instruction
-          // that we put there. That's not a valid block, so delete the branch
-          // instruction.
-          bb->getInstList().pop_back();
-
-          // Now we can finally get rid of the call instruction which now lives
-          // in the new basic block.
-          ci->eraseFromParent();
-
-          // Optimization succeeded, return true.
-          return true;
-        }
-    // We didn't pass the criteria for this optimization so return false
-    return false;
-  }
-} ExitInMainOptimizer;
-
-/// This LibCallOptimization will simplify a call to the strcat library
-/// function. The simplification is possible only if the string being
-/// concatenated is a constant array or a constant expression that results in
-/// a constant string. In this case we can replace it with strlen + llvm.memcpy
-/// of the constant string. Both of these calls are further reduced, if possible
-/// on subsequent passes.
-/// @brief Simplify the strcat library function.
-struct VISIBILITY_HIDDEN StrCatOptimization : public LibCallOptimization {
-public:
-  /// @brief Default constructor
-  StrCatOptimization() : LibCallOptimization("strcat",
-      "Number of 'strcat' calls simplified") {}
-
-public:
-
-  /// @brief Make sure that the "strcat" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() == 2 &&
-           FT->getReturnType() == PointerType::getUnqual(Type::Int8Ty) &&
-           FT->getParamType(0) == FT->getReturnType() &&
-           FT->getParamType(1) == FT->getReturnType();
-  }
-
-  /// @brief Optimize the strcat library function
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // Extract some information from the instruction
-    Value *Dst = CI->getOperand(1);
-    Value *Src = CI->getOperand(2);
-
-    // Extract the initializer (while making numerous checks) from the
-    // source operand of the call to strcat.
-    std::string SrcStr;
-    if (!GetConstantStringInfo(Src, SrcStr))
-      return false;
-
-    // Handle the simple, do-nothing case
-    if (SrcStr.empty())
-      return ReplaceCallWith(CI, Dst);
-
-    // We need to find the end of the destination string.  That's where the
-    // memory is to be moved to. We just generate a call to strlen.
-    CallInst *DstLen = CallInst::Create(SLC.get_strlen(), Dst,
-                                        Dst->getName()+".len", CI);
-
-    // Now that we have the destination's length, we must index into the
-    // destination's pointer to get the actual memcpy destination (end of
-    // the string .. we're concatenating).
-    Dst = GetElementPtrInst::Create(Dst, DstLen, Dst->getName()+".indexed", CI);
-
-    // We have enough information to now generate the memcpy call to
-    // do the concatenation for us.
-    Value *Vals[] = {
-      Dst, Src,
-      ConstantInt::get(SLC.getIntPtrType(), SrcStr.size()+1), // copy nul byte.
-      ConstantInt::get(Type::Int32Ty, 1)  // alignment
-    };
-    CallInst::Create(SLC.get_memcpy(), Vals, Vals + 4, "", CI);
-
-    return ReplaceCallWith(CI, Dst);
-  }
-} StrCatOptimizer;
-
-/// This LibCallOptimization will simplify a call to the strchr library
-/// function.  It optimizes out cases where the arguments are both constant
-/// and the result can be determined statically.
-/// @brief Simplify the strcmp library function.
-struct VISIBILITY_HIDDEN StrChrOptimization : public LibCallOptimization {
-public:
-  StrChrOptimization() : LibCallOptimization("strchr",
-      "Number of 'strchr' calls simplified") {}
-
-  /// @brief Make sure that the "strchr" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() == 2 &&
-           FT->getReturnType() == PointerType::getUnqual(Type::Int8Ty) &&
-           FT->getParamType(0) == FT->getReturnType() &&
-           isa<IntegerType>(FT->getParamType(1));
-  }
-
-  /// @brief Perform the strchr optimizations
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // Check that the first argument to strchr is a constant array of sbyte.
-    std::string Str;
-    if (!GetConstantStringInfo(CI->getOperand(1), Str))
-      return false;
-
-    // If the second operand is not constant, just lower this to memchr since we
-    // know the length of the input string.
-    ConstantInt *CSI = dyn_cast<ConstantInt>(CI->getOperand(2));
-    if (!CSI) {
-      Value *Args[3] = {
-        CI->getOperand(1),
-        CI->getOperand(2),
-        ConstantInt::get(SLC.getIntPtrType(), Str.size()+1)
-      };
-      return ReplaceCallWith(CI, CallInst::Create(SLC.get_memchr(), Args, Args + 3,
-                                                  CI->getName(), CI));
-    }
-
-    // strchr can find the nul character.
-    Str += '\0';
-    
-    // Get the character we're looking for
-    char CharValue = CSI->getSExtValue();
-
-    // Compute the offset
-    uint64_t i = 0;
-    while (1) {
-      if (i == Str.size())    // Didn't find the char.  strchr returns null.
-        return ReplaceCallWith(CI, Constant::getNullValue(CI->getType()));
-      // Did we find our match?
-      if (Str[i] == CharValue)
-        break;
-      ++i;
-    }
-
-    // strchr(s+n,c)  -> gep(s+n+i,c)
-    //    (if c is a constant integer and s is a constant string)
-    Value *Idx = ConstantInt::get(Type::Int64Ty, i);
-    Value *GEP = GetElementPtrInst::Create(CI->getOperand(1), Idx, 
-                                           CI->getOperand(1)->getName() +
-                                           ".strchr", CI);
-    return ReplaceCallWith(CI, GEP);
-  }
-} StrChrOptimizer;
-
-/// This LibCallOptimization will simplify a call to the strcmp library
-/// function.  It optimizes out cases where one or both arguments are constant
-/// and the result can be determined statically.
-/// @brief Simplify the strcmp library function.
-struct VISIBILITY_HIDDEN StrCmpOptimization : public LibCallOptimization {
-public:
-  StrCmpOptimization() : LibCallOptimization("strcmp",
-      "Number of 'strcmp' calls simplified") {}
-
-  /// @brief Make sure that the "strcmp" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getReturnType() == Type::Int32Ty && FT->getNumParams() == 2 &&
-           FT->getParamType(0) == FT->getParamType(1) &&
-           FT->getParamType(0) == PointerType::getUnqual(Type::Int8Ty);
-  }
-
-  /// @brief Perform the strcmp optimization
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // First, check to see if src and destination are the same. If they are,
-    // then the optimization is to replace the CallInst with a constant 0
-    // because the call is a no-op.
-    Value *Str1P = CI->getOperand(1);
-    Value *Str2P = CI->getOperand(2);
-    if (Str1P == Str2P)      // strcmp(x,x)  -> 0
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 0));
-
-    std::string Str1;
-    if (!GetConstantStringInfo(Str1P, Str1))
-      return false;
-    if (Str1.empty()) {
-      // strcmp("", x) -> *x
-      Value *V = new LoadInst(Str2P, CI->getName()+".load", CI);
-      V = new ZExtInst(V, CI->getType(), CI->getName()+".int", CI);
-      return ReplaceCallWith(CI, V);
-    }
-
-    std::string Str2;
-    if (!GetConstantStringInfo(Str2P, Str2))
-      return false;
-    if (Str2.empty()) {
-      // strcmp(x,"") -> *x
-      Value *V = new LoadInst(Str1P, CI->getName()+".load", CI);
-      V = new ZExtInst(V, CI->getType(), CI->getName()+".int", CI);
-      return ReplaceCallWith(CI, V);
-    }
-
-    // strcmp(x, y)  -> cnst  (if both x and y are constant strings)
-    int R = strcmp(Str1.c_str(), Str2.c_str());
-    return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), R));
-  }
-} StrCmpOptimizer;
-
-/// This LibCallOptimization will simplify a call to the strncmp library
-/// function.  It optimizes out cases where one or both arguments are constant
-/// and the result can be determined statically.
-/// @brief Simplify the strncmp library function.
-struct VISIBILITY_HIDDEN StrNCmpOptimization : public LibCallOptimization {
-public:
-  StrNCmpOptimization() : LibCallOptimization("strncmp",
-      "Number of 'strncmp' calls simplified") {}
-
-  /// @brief Make sure that the "strncmp" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getReturnType() == Type::Int32Ty && FT->getNumParams() == 3 &&
-           FT->getParamType(0) == FT->getParamType(1) &&
-           FT->getParamType(0) == PointerType::getUnqual(Type::Int8Ty) &&
-           isa<IntegerType>(FT->getParamType(2));
-    return false;
-  }
-
-  /// @brief Perform the strncmp optimization
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // First, check to see if src and destination are the same. If they are,
-    // then the optimization is to replace the CallInst with a constant 0
-    // because the call is a no-op.
-    Value *Str1P = CI->getOperand(1);
-    Value *Str2P = CI->getOperand(2);
-    if (Str1P == Str2P)  // strncmp(x,x, n)  -> 0
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 0));
-    
-    // Check the length argument, if it is Constant zero then the strings are
-    // considered equal.
-    uint64_t Length;
-    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3)))
-      Length = LengthArg->getZExtValue();
-    else
-      return false;
-    
-    if (Length == 0) // strncmp(x,y,0)   -> 0
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 0));
-    
-    std::string Str1;
-    if (!GetConstantStringInfo(Str1P, Str1))
-      return false;
-    if (Str1.empty()) {
-      // strncmp("", x, n) -> *x
-      Value *V = new LoadInst(Str2P, CI->getName()+".load", CI);
-      V = new ZExtInst(V, CI->getType(), CI->getName()+".int", CI);
-      return ReplaceCallWith(CI, V);
-    }
-    
-    std::string Str2;
-    if (!GetConstantStringInfo(Str2P, Str2))
-      return false;
-    if (Str2.empty()) {
-      // strncmp(x, "", n) -> *x
-      Value *V = new LoadInst(Str1P, CI->getName()+".load", CI);
-      V = new ZExtInst(V, CI->getType(), CI->getName()+".int", CI);
-      return ReplaceCallWith(CI, V);
-    }
-    
-    // strncmp(x, y, n)  -> cnst  (if both x and y are constant strings)
-    int R = strncmp(Str1.c_str(), Str2.c_str(), Length);
-    return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), R));
-  }
-} StrNCmpOptimizer;
-
-/// This LibCallOptimization will simplify a call to the strcpy library
-/// function.  Two optimizations are possible:
-/// (1) If src and dest are the same and not volatile, just return dest
-/// (2) If the src is a constant then we can convert to llvm.memmove
-/// @brief Simplify the strcpy library function.
-struct VISIBILITY_HIDDEN StrCpyOptimization : public LibCallOptimization {
-public:
-  StrCpyOptimization() : LibCallOptimization("strcpy",
-      "Number of 'strcpy' calls simplified") {}
-
-  /// @brief Make sure that the "strcpy" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() == 2 &&
-           FT->getParamType(0) == FT->getParamType(1) &&
-           FT->getReturnType() == FT->getParamType(0) &&
-           FT->getParamType(0) == PointerType::getUnqual(Type::Int8Ty);
-  }
-
-  /// @brief Perform the strcpy optimization
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // First, check to see if src and destination are the same. If they are,
-    // then the optimization is to replace the CallInst with the destination
-    // because the call is a no-op. Note that this corresponds to the
-    // degenerate strcpy(X,X) case which should have "undefined" results
-    // according to the C specification. However, it occurs sometimes and
-    // we optimize it as a no-op.
-    Value *Dst = CI->getOperand(1);
-    Value *Src = CI->getOperand(2);
-    if (Dst == Src) {
-      // strcpy(x, x) -> x
-      return ReplaceCallWith(CI, Dst);
-    }
-    
-    // Get the length of the constant string referenced by the Src operand.
-    std::string SrcStr;
-    if (!GetConstantStringInfo(Src, SrcStr))
-      return false;
-    
-    // If the constant string's length is zero we can optimize this by just
-    // doing a store of 0 at the first byte of the destination
-    if (SrcStr.empty()) {
-      new StoreInst(ConstantInt::get(Type::Int8Ty, 0), Dst, CI);
-      return ReplaceCallWith(CI, Dst);
-    }
-
-    // We have enough information to now generate the memcpy call to
-    // do the concatenation for us.
-    Value *MemcpyOps[] = {
-      Dst, Src, // Pass length including nul byte.
-      ConstantInt::get(SLC.getIntPtrType(), SrcStr.size()+1),
-      ConstantInt::get(Type::Int32Ty, 1) // alignment
-    };
-    CallInst::Create(SLC.get_memcpy(), MemcpyOps, MemcpyOps + 4, "", CI);
-
-    return ReplaceCallWith(CI, Dst);
-  }
-} StrCpyOptimizer;
-
-/// This LibCallOptimization will simplify a call to the strlen library
-/// function by replacing it with a constant value if the string provided to
-/// it is a constant array.
-/// @brief Simplify the strlen library function.
-struct VISIBILITY_HIDDEN StrLenOptimization : public LibCallOptimization {
-  StrLenOptimization() : LibCallOptimization("strlen",
-      "Number of 'strlen' calls simplified") {}
-
-  /// @brief Make sure that the "strlen" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() == 1 &&
-           FT->getParamType(0) == PointerType::getUnqual(Type::Int8Ty) &&
-           isa<IntegerType>(FT->getReturnType());
-  }
-
-  /// @brief Perform the strlen optimization
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // Make sure we're dealing with an sbyte* here.
-    Value *Src = CI->getOperand(1);
-
-    // Does the call to strlen have exactly one use?
-    if (CI->hasOneUse()) {
-      // Is that single use a icmp operator?
-      if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CI->use_back()))
-        // Is it compared against a constant integer?
-        if (ConstantInt *Cst = dyn_cast<ConstantInt>(Cmp->getOperand(1))) {
-          // If its compared against length 0 with == or !=
-          if (Cst->getZExtValue() == 0 && Cmp->isEquality()) {
-            // strlen(x) != 0 -> *x != 0
-            // strlen(x) == 0 -> *x == 0
-            Value *V = new LoadInst(Src, Src->getName()+".first", CI);
-            V = new ICmpInst(Cmp->getPredicate(), V, 
-                             ConstantInt::get(Type::Int8Ty, 0),
-                             Cmp->getName()+".strlen", CI);
-            Cmp->replaceAllUsesWith(V);
-            Cmp->eraseFromParent();
-            return ReplaceCallWith(CI, 0);  // no uses.
-          }
-        }
-    }
-
-    // Get the length of the constant string operand
-    std::string Str;
-    if (!GetConstantStringInfo(Src, Str))
-      return false;
-      
-    // strlen("xyz") -> 3 (for example)
-    return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), Str.size()));
-  }
-} StrLenOptimizer;
-
-/// IsOnlyUsedInEqualsComparison - Return true if it only matters that the value
-/// is equal or not-equal to zero. 
-static bool IsOnlyUsedInEqualsZeroComparison(Instruction *I) {
-  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
-       UI != E; ++UI) {
-    if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
-      if (IC->isEquality())
-        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
-          if (C->isNullValue())
-            continue;
-    // Unknown instruction.
-    return false;
-  }
-  return true;
-}
-
-/// This memcmpOptimization will simplify a call to the memcmp library
-/// function.
-struct VISIBILITY_HIDDEN memcmpOptimization : public LibCallOptimization {
-  /// @brief Default Constructor
-  memcmpOptimization()
-    : LibCallOptimization("memcmp", "Number of 'memcmp' calls simplified") {}
-  
-  /// @brief Make sure that the "memcmp" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &TD) {
-    Function::const_arg_iterator AI = F->arg_begin();
-    if (F->arg_size() != 3 || !isa<PointerType>(AI->getType())) return false;
-    if (!isa<PointerType>((++AI)->getType())) return false;
-    if (!(++AI)->getType()->isInteger()) return false;
-    if (!F->getReturnType()->isInteger()) return false;
-    return true;
-  }
-  
-  /// Because of alignment and instruction information that we don't have, we
-  /// leave the bulk of this to the code generators.
-  ///
-  /// Note that we could do much more if we could force alignment on otherwise
-  /// small aligned allocas, or if we could indicate that loads have a small
-  /// alignment.
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &TD) {
-    Value *LHS = CI->getOperand(1), *RHS = CI->getOperand(2);
-
-    // If the two operands are the same, return zero.
-    if (LHS == RHS) {
-      // memcmp(s,s,x) -> 0
-      return ReplaceCallWith(CI, Constant::getNullValue(CI->getType()));
-    }
-    
-    // Make sure we have a constant length.
-    ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getOperand(3));
-    if (!LenC) return false;
-    uint64_t Len = LenC->getZExtValue();
-      
-    // If the length is zero, this returns 0.
-    switch (Len) {
-    case 0:
-      // memcmp(s1,s2,0) -> 0
-      return ReplaceCallWith(CI, Constant::getNullValue(CI->getType()));
-    case 1: {
-      // memcmp(S1,S2,1) -> *(ubyte*)S1 - *(ubyte*)S2
-      const Type *UCharPtr = PointerType::getUnqual(Type::Int8Ty);
-      CastInst *Op1Cast = CastInst::create(
-          Instruction::BitCast, LHS, UCharPtr, LHS->getName(), CI);
-      CastInst *Op2Cast = CastInst::create(
-          Instruction::BitCast, RHS, UCharPtr, RHS->getName(), CI);
-      Value *S1V = new LoadInst(Op1Cast, LHS->getName()+".val", CI);
-      Value *S2V = new LoadInst(Op2Cast, RHS->getName()+".val", CI);
-      Value *RV = BinaryOperator::createSub(S1V, S2V, CI->getName()+".diff",CI);
-      if (RV->getType() != CI->getType())
-        RV = CastInst::createIntegerCast(RV, CI->getType(), false, 
-                                         RV->getName(), CI);
-      return ReplaceCallWith(CI, RV);
-    }
-    case 2:
-      if (IsOnlyUsedInEqualsZeroComparison(CI)) {
-        // TODO: IF both are aligned, use a short load/compare.
-      
-        // memcmp(S1,S2,2) -> S1[0]-S2[0] | S1[1]-S2[1] iff only ==/!= 0 matters
-        const Type *UCharPtr = PointerType::getUnqual(Type::Int8Ty);
-        CastInst *Op1Cast = CastInst::create(
-            Instruction::BitCast, LHS, UCharPtr, LHS->getName(), CI);
-        CastInst *Op2Cast = CastInst::create(
-            Instruction::BitCast, RHS, UCharPtr, RHS->getName(), CI);
-        Value *S1V1 = new LoadInst(Op1Cast, LHS->getName()+".val1", CI);
-        Value *S2V1 = new LoadInst(Op2Cast, RHS->getName()+".val1", CI);
-        Value *D1 = BinaryOperator::createSub(S1V1, S2V1,
-                                              CI->getName()+".d1", CI);
-        Constant *One = ConstantInt::get(Type::Int32Ty, 1);
-        Value *G1 = GetElementPtrInst::Create(Op1Cast, One, "next1v", CI);
-        Value *G2 = GetElementPtrInst::Create(Op2Cast, One, "next2v", CI);
-        Value *S1V2 = new LoadInst(G1, LHS->getName()+".val2", CI);
-        Value *S2V2 = new LoadInst(G2, RHS->getName()+".val2", CI);
-        Value *D2 = BinaryOperator::createSub(S1V2, S2V2,
-                                              CI->getName()+".d1", CI);
-        Value *Or = BinaryOperator::createOr(D1, D2, CI->getName()+".res", CI);
-        if (Or->getType() != CI->getType())
-          Or = CastInst::createIntegerCast(Or, CI->getType(), false /*ZExt*/, 
-                                           Or->getName(), CI);
-        return ReplaceCallWith(CI, Or);
-      }
-      break;
-    default:
-      break;
-    }
-    
-    return false;
-  }
-} memcmpOptimizer;
-
-/// This LibCallOptimization will simplify a call to the memcpy library
-/// function.  It simply converts them into calls to llvm.memcpy.*;
-/// the resulting call should be optimized later.
-/// @brief Simplify the memcpy library function.
-struct VISIBILITY_HIDDEN MemCpyOptimization : public LibCallOptimization {
-public:
-  MemCpyOptimization() : LibCallOptimization("memcpy",
-      "Number of 'memcpy' calls simplified") {}
-
-  /// @brief Make sure that the "memcpy" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    const Type* voidPtr = PointerType::getUnqual(Type::Int8Ty);
-    return FT->getReturnType() == voidPtr && FT->getNumParams() == 3 &&
-           FT->getParamType(0) == voidPtr &&
-           FT->getParamType(1) == voidPtr &&
-           FT->getParamType(2) == SLC.getIntPtrType();
-  }
-
-  /// @brief Perform the memcpy optimization
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    Value *MemcpyOps[] = {
-      CI->getOperand(1), CI->getOperand(2), CI->getOperand(3),
-      ConstantInt::get(Type::Int32Ty, 1)   // align = 1 always.
-    };
-    CallInst::Create(SLC.get_memcpy(), MemcpyOps, MemcpyOps + 4, "", CI);
-    // memcpy always returns the destination
-    return ReplaceCallWith(CI, CI->getOperand(1));
-  }
-} MemCpyOptimizer;
-
-/// This LibCallOptimization will simplify a call to the memcpy library
-/// function by expanding it out to a single store of size 0, 1, 2, 4, or 8
-/// bytes depending on the length of the string and the alignment. Additional
-/// optimizations are possible in code generation (sequence of immediate store)
-/// @brief Simplify the memcpy library function.
-struct VISIBILITY_HIDDEN LLVMMemCpyMoveOptzn : public LibCallOptimization {
-  LLVMMemCpyMoveOptzn(const char* fname, const char* desc)
-  : LibCallOptimization(fname, desc) {}
-
-  /// @brief Make sure that the "memcpy" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function* f, SimplifyLibCalls& TD) {
-    // Just make sure this has 4 arguments per LLVM spec.
-    return (f->arg_size() == 4);
-  }
-
-  /// Because of alignment and instruction information that we don't have, we
-  /// leave the bulk of this to the code generators. The optimization here just
-  /// deals with a few degenerate cases where the length of the string and the
-  /// alignment match the sizes of our intrinsic types so we can do a load and
-  /// store instead of the memcpy call.
-  /// @brief Perform the memcpy optimization.
-  virtual bool OptimizeCall(CallInst* ci, SimplifyLibCalls& TD) {
-    // Make sure we have constant int values to work with
-    ConstantInt* LEN = dyn_cast<ConstantInt>(ci->getOperand(3));
-    if (!LEN)
-      return false;
-    ConstantInt* ALIGN = dyn_cast<ConstantInt>(ci->getOperand(4));
-    if (!ALIGN)
-      return false;
-
-    // If the length is larger than the alignment, we can't optimize
-    uint64_t len = LEN->getZExtValue();
-    uint64_t alignment = ALIGN->getZExtValue();
-    if (alignment == 0)
-      alignment = 1; // Alignment 0 is identity for alignment 1
-    if (len > alignment)
-      return false;
-
-    // Get the type we will cast to, based on size of the string
-    Value* dest = ci->getOperand(1);
-    Value* src = ci->getOperand(2);
-    const Type* castType = 0;
-    switch (len) {
-      case 0:
-        // memcpy(d,s,0,a) -> d
-        return ReplaceCallWith(ci, 0);
-      case 1: castType = Type::Int8Ty; break;
-      case 2: castType = Type::Int16Ty; break;
-      case 4: castType = Type::Int32Ty; break;
-      case 8: castType = Type::Int64Ty; break;
-      default:
-        return false;
-    }
-
-    // Cast source and dest to the right sized primitive and then load/store
-    CastInst* SrcCast = CastInst::create(Instruction::BitCast,
-        src, PointerType::getUnqual(castType), src->getName()+".cast", ci);
-    CastInst* DestCast = CastInst::create(Instruction::BitCast,
-        dest, PointerType::getUnqual(castType),dest->getName()+".cast", ci);
-    LoadInst* LI = new LoadInst(SrcCast,SrcCast->getName()+".val",ci);
-    new StoreInst(LI, DestCast, ci);
-    return ReplaceCallWith(ci, 0);
-  }
-};
-
-/// This LibCallOptimization will simplify a call to the memcpy/memmove library
-/// functions.
-LLVMMemCpyMoveOptzn LLVMMemCpyOptimizer32("llvm.memcpy.i32",
-                                    "Number of 'llvm.memcpy' calls simplified");
-LLVMMemCpyMoveOptzn LLVMMemCpyOptimizer64("llvm.memcpy.i64",
-                                   "Number of 'llvm.memcpy' calls simplified");
-LLVMMemCpyMoveOptzn LLVMMemMoveOptimizer32("llvm.memmove.i32",
-                                   "Number of 'llvm.memmove' calls simplified");
-LLVMMemCpyMoveOptzn LLVMMemMoveOptimizer64("llvm.memmove.i64",
-                                   "Number of 'llvm.memmove' calls simplified");
-
-/// This LibCallOptimization will simplify a call to the memset library
-/// function by expanding it out to a single store of size 0, 1, 2, 4, or 8
-/// bytes depending on the length argument.
-struct VISIBILITY_HIDDEN LLVMMemSetOptimization : public LibCallOptimization {
-  /// @brief Default Constructor
-  LLVMMemSetOptimization(const char *Name) : LibCallOptimization(Name,
-      "Number of 'llvm.memset' calls simplified") {}
-
-  /// @brief Make sure that the "memset" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &TD) {
-    // Just make sure this has 3 arguments per LLVM spec.
-    return F->arg_size() == 4;
-  }
-
-  /// Because of alignment and instruction information that we don't have, we
-  /// leave the bulk of this to the code generators. The optimization here just
-  /// deals with a few degenerate cases where the length parameter is constant
-  /// and the alignment matches the sizes of our intrinsic types so we can do
-  /// store instead of the memcpy call. Other calls are transformed into the
-  /// llvm.memset intrinsic.
-  /// @brief Perform the memset optimization.
-  virtual bool OptimizeCall(CallInst *ci, SimplifyLibCalls &TD) {
-    // Make sure we have constant int values to work with
-    ConstantInt* LEN = dyn_cast<ConstantInt>(ci->getOperand(3));
-    if (!LEN)
-      return false;
-    ConstantInt* ALIGN = dyn_cast<ConstantInt>(ci->getOperand(4));
-    if (!ALIGN)
-      return false;
-
-    // Extract the length and alignment
-    uint64_t len = LEN->getZExtValue();
-    uint64_t alignment = ALIGN->getZExtValue();
-
-    // Alignment 0 is identity for alignment 1
-    if (alignment == 0)
-      alignment = 1;
-
-    // If the length is zero, this is a no-op
-    if (len == 0) {
-      // memset(d,c,0,a) -> noop
-      return ReplaceCallWith(ci, 0);
-    }
-
-    // If the length is larger than the alignment, we can't optimize
-    if (len > alignment)
-      return false;
-
-    // Make sure we have a constant ubyte to work with so we can extract
-    // the value to be filled.
-    ConstantInt* FILL = dyn_cast<ConstantInt>(ci->getOperand(2));
-    if (!FILL)
-      return false;
-    if (FILL->getType() != Type::Int8Ty)
-      return false;
-
-    // memset(s,c,n) -> store s, c (for n=1,2,4,8)
-
-    // Extract the fill character
-    uint64_t fill_char = FILL->getZExtValue();
-    uint64_t fill_value = fill_char;
-
-    // Get the type we will cast to, based on size of memory area to fill, and
-    // and the value we will store there.
-    Value* dest = ci->getOperand(1);
-    const Type* castType = 0;
-    switch (len) {
-      case 1:
-        castType = Type::Int8Ty;
-        break;
-      case 2:
-        castType = Type::Int16Ty;
-        fill_value |= fill_char << 8;
-        break;
-      case 4:
-        castType = Type::Int32Ty;
-        fill_value |= fill_char << 8 | fill_char << 16 | fill_char << 24;
-        break;
-      case 8:
-        castType = Type::Int64Ty;
-        fill_value |= fill_char << 8 | fill_char << 16 | fill_char << 24;
-        fill_value |= fill_char << 32 | fill_char << 40 | fill_char << 48;
-        fill_value |= fill_char << 56;
-        break;
-      default:
-        return false;
-    }
-
-    // Cast dest to the right sized primitive and then load/store
-    CastInst* DestCast = new BitCastInst(dest, PointerType::getUnqual(castType), 
-                                         dest->getName()+".cast", ci);
-    new StoreInst(ConstantInt::get(castType,fill_value),DestCast, ci);
-    return ReplaceCallWith(ci, 0);
-  }
-};
-
-LLVMMemSetOptimization MemSet32Optimizer("llvm.memset.i32");
-LLVMMemSetOptimization MemSet64Optimizer("llvm.memset.i64");
-
-
-/// This LibCallOptimization will simplify calls to the "pow" library
-/// function. It looks for cases where the result of pow is well known and
-/// substitutes the appropriate value.
-/// @brief Simplify the pow library function.
-struct VISIBILITY_HIDDEN PowOptimization : public LibCallOptimization {
-public:
-  /// @brief Default Constructor
-  PowOptimization(const char *Name) : LibCallOptimization(Name,
-      "Number of 'pow' calls simplified") {}
-
-  /// @brief Make sure that the "pow" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    // Just make sure this has 2 arguments of the same FP type, which match the
-    // result type.
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() == 2 && 
-          FT->getParamType(0) == FT->getParamType(1) &&
-          FT->getParamType(0) == FT->getReturnType() &&
-          FT->getParamType(0)->isFloatingPoint();
-  }
-
-  /// @brief Perform the pow optimization.
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    Value *Op1 = CI->getOperand(1);
-    Value *Op2 = CI->getOperand(2);
-    if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
-      if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0
-        return ReplaceCallWith(CI, Op1C);
-      if (Op1C->isExactlyValue(2.0)) {// pow(2.0, x) -> exp2(x)
-        Value *Exp2 = SLC.getUnaryFloatFunction("exp2", CI->getType());
-        Value *Res = CallInst::Create(Exp2, Op2, CI->getName()+"exp2", CI);
-        return ReplaceCallWith(CI, Res);
-      }
-    }
-    
-    ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
-    if (Op2C == 0) return false;
-    
-    if (Op2C->getValueAPF().isZero()) {
-      // pow(x, 0.0) -> 1.0
-      return ReplaceCallWith(CI, ConstantFP::get(CI->getType(), 1.0));
-    } else if (Op2C->isExactlyValue(0.5)) {
-      // FIXME: This is not safe for -0.0 and -inf.  This can only be done when
-      // 'unsafe' math optimizations are allowed.
-      // x    pow(x, 0.5)  sqrt(x)
-      // ---------------------------------------------
-      // -0.0    +0.0       -0.0
-      // -inf    +inf       NaN
-#if 0
-      // pow(x, 0.5) -> sqrt(x)
-      Value *Sqrt = CallInst::Create(SLC.get_sqrt(), Op1, "sqrt", CI);
-      return ReplaceCallWith(CI, Sqrt);
-#endif
-    } else if (Op2C->isExactlyValue(1.0)) {
-      // pow(x, 1.0) -> x
-      return ReplaceCallWith(CI, Op1);
-    } else if (Op2C->isExactlyValue(2.0)) {
-      // pow(x, 2.0) -> x*x
-      Value *Sq = BinaryOperator::createMul(Op1, Op1, "pow2", CI);
-      return ReplaceCallWith(CI, Sq);
-    } else if (Op2C->isExactlyValue(-1.0)) {
-      // pow(x, -1.0) -> 1.0/x
-      Value *R = BinaryOperator::createFDiv(ConstantFP::get(CI->getType(), 1.0),
-                                            Op1, CI->getName()+".pow", CI);
-      return ReplaceCallWith(CI, R);
-    }
-    return false; // opt failed
-  }
-};
-
-PowOptimization PowFOptimizer("powf");
-PowOptimization PowOptimizer("pow");
-PowOptimization PowLOptimizer("powl");
- 
-  
-/// This LibCallOptimization will simplify calls to the "printf" library
-/// function. It looks for cases where the result of printf is not used and the
-/// operation can be reduced to something simpler.
-/// @brief Simplify the printf library function.
-struct VISIBILITY_HIDDEN PrintfOptimization : public LibCallOptimization {
-public:
-  /// @brief Default Constructor
-  PrintfOptimization() : LibCallOptimization("printf",
-      "Number of 'printf' calls simplified") {}
-
-  /// @brief Make sure that the "printf" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    // Just make sure this has at least 1 argument and returns an integer or
-    // void type.
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() >= 1 &&
-          (isa<IntegerType>(FT->getReturnType()) ||
-           FT->getReturnType() == Type::VoidTy);
-  }
-
-  /// @brief Perform the printf optimization.
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // All the optimizations depend on the length of the first argument and the
-    // fact that it is a constant string array. Check that now
-    std::string FormatStr;
-    if (!GetConstantStringInfo(CI->getOperand(1), FormatStr))
-      return false;
-    
-    // If this is a simple constant string with no format specifiers that ends
-    // with a \n, turn it into a puts call.
-    if (FormatStr.empty()) {
-      // Tolerate printf's declared void.
-      if (CI->use_empty()) return ReplaceCallWith(CI, 0);
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 0));
-    }
-    
-    if (FormatStr.size() == 1) {
-      // Turn this into a putchar call, even if it is a %.
-      Value *V = ConstantInt::get(Type::Int32Ty, FormatStr[0]);
-      CallInst::Create(SLC.get_putchar(), V, "", CI);
-      if (CI->use_empty()) return ReplaceCallWith(CI, 0);
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 1));
-    }
-
-    // Check to see if the format str is something like "foo\n", in which case
-    // we convert it to a puts call.  We don't allow it to contain any format
-    // characters.
-    if (FormatStr[FormatStr.size()-1] == '\n' &&
-        FormatStr.find('%') == std::string::npos) {
-      // Create a string literal with no \n on it.  We expect the constant merge
-      // pass to be run after this pass, to merge duplicate strings.
-      FormatStr.erase(FormatStr.end()-1);
-      Constant *Init = ConstantArray::get(FormatStr, true);
-      Constant *GV = new GlobalVariable(Init->getType(), true,
-                                        GlobalVariable::InternalLinkage,
-                                        Init, "str",
-                                     CI->getParent()->getParent()->getParent());
-      // Cast GV to be a pointer to char.
-      GV = ConstantExpr::getBitCast(GV, PointerType::getUnqual(Type::Int8Ty));
-      CallInst::Create(SLC.get_puts(), GV, "", CI);
-
-      if (CI->use_empty()) return ReplaceCallWith(CI, 0);
-      // The return value from printf includes the \n we just removed, so +1.
-      return ReplaceCallWith(CI,
-                             ConstantInt::get(CI->getType(), 
-                                              FormatStr.size()+1));
-    }
-    
-    
-    // Only support %c or "%s\n" for now.
-    if (FormatStr.size() < 2 || FormatStr[0] != '%')
-      return false;
-
-    // Get the second character and switch on its value
-    switch (FormatStr[1]) {
-    default:  return false;
-    case 's':
-      if (FormatStr != "%s\n" || CI->getNumOperands() < 3 ||
-          // TODO: could insert strlen call to compute string length.
-          !CI->use_empty())
-        return false;
-
-      // printf("%s\n",str) -> puts(str)
-      CallInst::Create(SLC.get_puts(), CastToCStr(CI->getOperand(2), CI),
-                       CI->getName(), CI);
-      return ReplaceCallWith(CI, 0);
-    case 'c': {
-      // printf("%c",c) -> putchar(c)
-      if (FormatStr.size() != 2 || CI->getNumOperands() < 3)
-        return false;
-      
-      Value *V = CI->getOperand(2);
-      if (!isa<IntegerType>(V->getType()) ||
-          cast<IntegerType>(V->getType())->getBitWidth() > 32)
-        return false;
-
-      V = CastInst::createZExtOrBitCast(V, Type::Int32Ty, CI->getName()+".int",
-                                        CI);
-      CallInst::Create(SLC.get_putchar(), V, "", CI);
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 1));
-    }
-    }
-  }
-} PrintfOptimizer;
-
-/// This LibCallOptimization will simplify calls to the "fprintf" library
-/// function. It looks for cases where the result of fprintf is not used and the
-/// operation can be reduced to something simpler.
-/// @brief Simplify the fprintf library function.
-struct VISIBILITY_HIDDEN FPrintFOptimization : public LibCallOptimization {
-public:
-  /// @brief Default Constructor
-  FPrintFOptimization() : LibCallOptimization("fprintf",
-      "Number of 'fprintf' calls simplified") {}
-
-  /// @brief Make sure that the "fprintf" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() == 2 &&  // two fixed arguments.
-           FT->getParamType(1) == PointerType::getUnqual(Type::Int8Ty) &&
-           isa<PointerType>(FT->getParamType(0)) &&
-           isa<IntegerType>(FT->getReturnType());
-  }
-
-  /// @brief Perform the fprintf optimization.
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // If the call has more than 3 operands, we can't optimize it
-    if (CI->getNumOperands() != 3 && CI->getNumOperands() != 4)
-      return false;
-
-    // All the optimizations depend on the format string.
-    std::string FormatStr;
-    if (!GetConstantStringInfo(CI->getOperand(2), FormatStr))
-      return false;
-
-    // If this is just a format string, turn it into fwrite.
-    if (CI->getNumOperands() == 3) {
-      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
-        if (FormatStr[i] == '%')
-          return false; // we found a format specifier
-
-      // fprintf(file,fmt) -> fwrite(fmt,strlen(fmt),file)
-      const Type *FILEty = CI->getOperand(1)->getType();
-
-      Value *FWriteArgs[] = {
-        CI->getOperand(2),
-        ConstantInt::get(SLC.getIntPtrType(), FormatStr.size()),
-        ConstantInt::get(SLC.getIntPtrType(), 1),
-        CI->getOperand(1)
-      };
-      CallInst::Create(SLC.get_fwrite(FILEty), FWriteArgs, FWriteArgs + 4, CI->getName(), CI);
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 
-                                                  FormatStr.size()));
-    }
-    
-    // The remaining optimizations require the format string to be length 2:
-    // "%s" or "%c".
-    if (FormatStr.size() != 2 || FormatStr[0] != '%')
-      return false;
-
-    // Get the second character and switch on its value
-    switch (FormatStr[1]) {
-    case 'c': {
-      // fprintf(file,"%c",c) -> fputc(c,file)
-      const Type *FILETy = CI->getOperand(1)->getType();
-      Value *C = CastInst::createZExtOrBitCast(CI->getOperand(3), Type::Int32Ty,
-                                               CI->getName()+".int", CI);
-      SmallVector<Value *, 2> Args;
-      Args.push_back(C);
-      Args.push_back(CI->getOperand(1));
-      CallInst::Create(SLC.get_fputc(FILETy), Args.begin(), Args.end(), "", CI);
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 1));
-    }
-    case 's': {
-      const Type *FILETy = CI->getOperand(1)->getType();
-      
-      // If the result of the fprintf call is used, we can't do this.
-      // TODO: we should insert a strlen call.
-      if (!CI->use_empty())
-        return false;
-      
-      // fprintf(file,"%s",str) -> fputs(str,file)
-      SmallVector<Value *, 2> Args;
-      Args.push_back(CastToCStr(CI->getOperand(3), CI));
-      Args.push_back(CI->getOperand(1));
-      CallInst::Create(SLC.get_fputs(FILETy), Args.begin(),
-                       Args.end(), CI->getName(), CI);
-      return ReplaceCallWith(CI, 0);
-    }
-    default:
-      return false;
-    }
-  }
-} FPrintFOptimizer;
-
-/// This LibCallOptimization will simplify calls to the "sprintf" library
-/// function. It looks for cases where the result of sprintf is not used and the
-/// operation can be reduced to something simpler.
-/// @brief Simplify the sprintf library function.
-struct VISIBILITY_HIDDEN SPrintFOptimization : public LibCallOptimization {
-public:
-  /// @brief Default Constructor
-  SPrintFOptimization() : LibCallOptimization("sprintf",
-      "Number of 'sprintf' calls simplified") {}
-
-  /// @brief Make sure that the "sprintf" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() == 2 &&  // two fixed arguments.
-           FT->getParamType(1) == PointerType::getUnqual(Type::Int8Ty) &&
-           FT->getParamType(0) == FT->getParamType(1) &&
-           isa<IntegerType>(FT->getReturnType());
-  }
-
-  /// @brief Perform the sprintf optimization.
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // If the call has more than 3 operands, we can't optimize it
-    if (CI->getNumOperands() != 3 && CI->getNumOperands() != 4)
-      return false;
-
-    std::string FormatStr;
-    if (!GetConstantStringInfo(CI->getOperand(2), FormatStr))
-      return false;
-    
-    if (CI->getNumOperands() == 3) {
-      // Make sure there's no % in the constant array
-      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
-        if (FormatStr[i] == '%')
-          return false; // we found a format specifier
-      
-      // sprintf(str,fmt) -> llvm.memcpy(str,fmt,strlen(fmt),1)
-      Value *MemCpyArgs[] = {
-        CI->getOperand(1), CI->getOperand(2),
-        ConstantInt::get(SLC.getIntPtrType(), 
-                         FormatStr.size()+1), // Copy the nul byte.
-        ConstantInt::get(Type::Int32Ty, 1)
-      };
-      CallInst::Create(SLC.get_memcpy(), MemCpyArgs, MemCpyArgs + 4, "", CI);
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 
-                                                  FormatStr.size()));
-    }
-
-    // The remaining optimizations require the format string to be "%s" or "%c".
-    if (FormatStr.size() != 2 || FormatStr[0] != '%')
-      return false;
-
-    // Get the second character and switch on its value
-    switch (FormatStr[1]) {
-    case 'c': {
-      // sprintf(dest,"%c",chr) -> store chr, dest
-      Value *V = CastInst::createTruncOrBitCast(CI->getOperand(3),
-                                                Type::Int8Ty, "char", CI);
-      new StoreInst(V, CI->getOperand(1), CI);
-      Value *Ptr = GetElementPtrInst::Create(CI->getOperand(1),
-                                             ConstantInt::get(Type::Int32Ty, 1),
-                                             CI->getOperand(1)->getName()+".end",
-                                             CI);
-      new StoreInst(ConstantInt::get(Type::Int8Ty,0), Ptr, CI);
-      return ReplaceCallWith(CI, ConstantInt::get(Type::Int32Ty, 1));
-    }
-    case 's': {
-      // sprintf(dest,"%s",str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
-      Value *Len = CallInst::Create(SLC.get_strlen(),
-                                    CastToCStr(CI->getOperand(3), CI),
-                                    CI->getOperand(3)->getName()+".len", CI);
-      Value *UnincLen = Len;
-      Len = BinaryOperator::createAdd(Len, ConstantInt::get(Len->getType(), 1),
-                                      Len->getName()+"1", CI);
-      Value *MemcpyArgs[4] = {
-        CI->getOperand(1),
-        CastToCStr(CI->getOperand(3), CI),
-        Len,
-        ConstantInt::get(Type::Int32Ty, 1)
-      };
-      CallInst::Create(SLC.get_memcpy(), MemcpyArgs, MemcpyArgs + 4, "", CI);
-      
-      // The strlen result is the unincremented number of bytes in the string.
-      if (!CI->use_empty()) {
-        if (UnincLen->getType() != CI->getType())
-          UnincLen = CastInst::createIntegerCast(UnincLen, CI->getType(), false, 
-                                                 Len->getName(), CI);
-        CI->replaceAllUsesWith(UnincLen);
-      }
-      return ReplaceCallWith(CI, 0);
-    }
-    }
-    return false;
-  }
-} SPrintFOptimizer;
-
-/// This LibCallOptimization will simplify calls to the "fputs" library
-/// function. It looks for cases where the result of fputs is not used and the
-/// operation can be reduced to something simpler.
-/// @brief Simplify the fputs library function.
-struct VISIBILITY_HIDDEN FPutsOptimization : public LibCallOptimization {
-public:
-  /// @brief Default Constructor
-  FPutsOptimization() : LibCallOptimization("fputs",
-      "Number of 'fputs' calls simplified") {}
-
-  /// @brief Make sure that the "fputs" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    // Just make sure this has 2 arguments
-    return F->arg_size() == 2;
-  }
-
-  /// @brief Perform the fputs optimization.
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // If the result is used, none of these optimizations work.
-    if (!CI->use_empty())
-      return false;
-
-    // All the optimizations depend on the length of the first argument and the
-    // fact that it is a constant string array. Check that now
-    std::string Str;
-    if (!GetConstantStringInfo(CI->getOperand(1), Str))
-      return false;
-
-    const Type *FILETy = CI->getOperand(2)->getType();
-    // fputs(s,F)  -> fwrite(s,1,len,F) (if s is constant and strlen(s) > 1)
-    Value *FWriteParms[4] = {
-      CI->getOperand(1),
-      ConstantInt::get(SLC.getIntPtrType(), Str.size()),
-      ConstantInt::get(SLC.getIntPtrType(), 1),
-      CI->getOperand(2)
-    };
-    CallInst::Create(SLC.get_fwrite(FILETy), FWriteParms, FWriteParms + 4, "", CI);
-    return ReplaceCallWith(CI, 0);  // Known to have no uses (see above).
-  }
-} FPutsOptimizer;
-
-/// This LibCallOptimization will simplify calls to the "fwrite" function.
-struct VISIBILITY_HIDDEN FWriteOptimization : public LibCallOptimization {
-public:
-  /// @brief Default Constructor
-  FWriteOptimization() : LibCallOptimization("fwrite",
-                                       "Number of 'fwrite' calls simplified") {}
-  
-  /// @brief Make sure that the "fputs" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    const FunctionType *FT = F->getFunctionType();
-    return FT->getNumParams() == 4 && 
-           FT->getParamType(0) == PointerType::getUnqual(Type::Int8Ty) &&
-           FT->getParamType(1) == FT->getParamType(2) &&
-           isa<IntegerType>(FT->getParamType(1)) &&
-           isa<PointerType>(FT->getParamType(3)) &&
-           isa<IntegerType>(FT->getReturnType());
-  }
-  
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // Get the element size and count.
-    uint64_t EltSize, EltCount;
-    if (ConstantInt *C = dyn_cast<ConstantInt>(CI->getOperand(2)))
-      EltSize = C->getZExtValue();
-    else
-      return false;
-    if (ConstantInt *C = dyn_cast<ConstantInt>(CI->getOperand(3)))
-      EltCount = C->getZExtValue();
-    else
-      return false;
-    
-    // If this is writing zero records, remove the call (it's a noop).
-    if (EltSize * EltCount == 0)
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 0));
-    
-    // If this is writing one byte, turn it into fputc.
-    if (EltSize == 1 && EltCount == 1) {
-      SmallVector<Value *, 2> Args;
-      // fwrite(s,1,1,F) -> fputc(s[0],F)
-      Value *Ptr = CI->getOperand(1);
-      Value *Val = new LoadInst(Ptr, Ptr->getName()+".byte", CI);
-      Args.push_back(new ZExtInst(Val, Type::Int32Ty, Val->getName()+".int", CI));
-      Args.push_back(CI->getOperand(4));
-      const Type *FILETy = CI->getOperand(4)->getType();
-      CallInst::Create(SLC.get_fputc(FILETy), Args.begin(), Args.end(), "", CI);
-      return ReplaceCallWith(CI, ConstantInt::get(CI->getType(), 1));
-    }
-    return false;
-  }
-} FWriteOptimizer;
-
-/// This LibCallOptimization will simplify calls to the "isdigit" library
-/// function. It simply does range checks the parameter explicitly.
-/// @brief Simplify the isdigit library function.
-struct VISIBILITY_HIDDEN isdigitOptimization : public LibCallOptimization {
-public:
-  isdigitOptimization() : LibCallOptimization("isdigit",
-      "Number of 'isdigit' calls simplified") {}
-
-  /// @brief Make sure that the "isdigit" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function* f, SimplifyLibCalls& SLC){
-    // Just make sure this has 1 argument
-    return (f->arg_size() == 1);
-  }
-
-  /// @brief Perform the toascii optimization.
-  virtual bool OptimizeCall(CallInst *ci, SimplifyLibCalls &SLC) {
-    if (ConstantInt* CI = dyn_cast<ConstantInt>(ci->getOperand(1))) {
-      // isdigit(c)   -> 0 or 1, if 'c' is constant
-      uint64_t val = CI->getZExtValue();
-      if (val >= '0' && val <= '9')
-        return ReplaceCallWith(ci, ConstantInt::get(Type::Int32Ty, 1));
-      else
-        return ReplaceCallWith(ci, ConstantInt::get(Type::Int32Ty, 0));
-    }
-
-    // isdigit(c)   -> (unsigned)c - '0' <= 9
-    CastInst* cast = CastInst::createIntegerCast(ci->getOperand(1),
-        Type::Int32Ty, false/*ZExt*/, ci->getOperand(1)->getName()+".uint", ci);
-    BinaryOperator* sub_inst = BinaryOperator::createSub(cast,
-        ConstantInt::get(Type::Int32Ty,0x30),
-        ci->getOperand(1)->getName()+".sub",ci);
-    ICmpInst* setcond_inst = new ICmpInst(ICmpInst::ICMP_ULE,sub_inst,
-        ConstantInt::get(Type::Int32Ty,9),
-        ci->getOperand(1)->getName()+".cmp",ci);
-    CastInst* c2 = new ZExtInst(setcond_inst, Type::Int32Ty, 
-        ci->getOperand(1)->getName()+".isdigit", ci);
-    return ReplaceCallWith(ci, c2);
-  }
-} isdigitOptimizer;
-
-struct VISIBILITY_HIDDEN isasciiOptimization : public LibCallOptimization {
-public:
-  isasciiOptimization()
-    : LibCallOptimization("isascii", "Number of 'isascii' calls simplified") {}
-  
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    return F->arg_size() == 1 && F->arg_begin()->getType()->isInteger() && 
-           F->getReturnType()->isInteger();
-  }
-  
-  /// @brief Perform the isascii optimization.
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-    // isascii(c)   -> (unsigned)c < 128
-    Value *V = CI->getOperand(1);
-    Value *Cmp = new ICmpInst(ICmpInst::ICMP_ULT, V, 
-                              ConstantInt::get(V->getType(), 128), 
-                              V->getName()+".isascii", CI);
-    if (Cmp->getType() != CI->getType())
-      Cmp = new ZExtInst(Cmp, CI->getType(), Cmp->getName(), CI);
-    return ReplaceCallWith(CI, Cmp);
-  }
-} isasciiOptimizer;
-
-
-/// This LibCallOptimization will simplify calls to the "toascii" library
-/// function. It simply does the corresponding and operation to restrict the
-/// range of values to the ASCII character set (0-127).
-/// @brief Simplify the toascii library function.
-struct VISIBILITY_HIDDEN ToAsciiOptimization : public LibCallOptimization {
-public:
-  /// @brief Default Constructor
-  ToAsciiOptimization() : LibCallOptimization("toascii",
-      "Number of 'toascii' calls simplified") {}
-
-  /// @brief Make sure that the "fputs" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function* f, SimplifyLibCalls& SLC){
-    // Just make sure this has 2 arguments
-    return (f->arg_size() == 1);
-  }
-
-  /// @brief Perform the toascii optimization.
-  virtual bool OptimizeCall(CallInst *ci, SimplifyLibCalls &SLC) {
-    // toascii(c)   -> (c & 0x7f)
-    Value *chr = ci->getOperand(1);
-    Value *and_inst = BinaryOperator::createAnd(chr,
-        ConstantInt::get(chr->getType(),0x7F),ci->getName()+".toascii",ci);
-    return ReplaceCallWith(ci, and_inst);
-  }
-} ToAsciiOptimizer;
-
-/// This LibCallOptimization will simplify calls to the "ffs" library
-/// calls which find the first set bit in an int, long, or long long. The
-/// optimization is to compute the result at compile time if the argument is
-/// a constant.
-/// @brief Simplify the ffs library function.
-struct VISIBILITY_HIDDEN FFSOptimization : public LibCallOptimization {
-protected:
-  /// @brief Subclass Constructor
-  FFSOptimization(const char* funcName, const char* description)
-    : LibCallOptimization(funcName, description) {}
-
-public:
-  /// @brief Default Constructor
-  FFSOptimization() : LibCallOptimization("ffs",
-      "Number of 'ffs' calls simplified") {}
-
-  /// @brief Make sure that the "ffs" function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    // Just make sure this has 2 arguments
-    return F->arg_size() == 1 && F->getReturnType() == Type::Int32Ty;
-  }
-
-  /// @brief Perform the ffs optimization.
-  virtual bool OptimizeCall(CallInst *TheCall, SimplifyLibCalls &SLC) {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(TheCall->getOperand(1))) {
-      // ffs(cnst)  -> bit#
-      // ffsl(cnst) -> bit#
-      // ffsll(cnst) -> bit#
-      uint64_t val = CI->getZExtValue();
-      int result = 0;
-      if (val) {
-        ++result;
-        while ((val & 1) == 0) {
-          ++result;
-          val >>= 1;
-        }
-      }
-      return ReplaceCallWith(TheCall, ConstantInt::get(Type::Int32Ty, result));
-    }
-
-    // ffs(x)   -> x == 0 ? 0 : llvm.cttz(x)+1
-    // ffsl(x)  -> x == 0 ? 0 : llvm.cttz(x)+1
-    // ffsll(x) -> x == 0 ? 0 : llvm.cttz(x)+1
-    const Type *ArgType = TheCall->getOperand(1)->getType();
-    assert(ArgType->getTypeID() == Type::IntegerTyID &&
-           "llvm.cttz argument is not an integer?");
-    Constant *F = Intrinsic::getDeclaration(SLC.getModule(),
-                                            Intrinsic::cttz, &ArgType, 1);
-
-    Value *V = CastInst::createIntegerCast(TheCall->getOperand(1), ArgType, 
-                                           false/*ZExt*/, "tmp", TheCall);
-    Value *V2 = CallInst::Create(F, V, "tmp", TheCall);
-    V2 = CastInst::createIntegerCast(V2, Type::Int32Ty, false/*ZExt*/, 
-                                     "tmp", TheCall);
-    V2 = BinaryOperator::createAdd(V2, ConstantInt::get(Type::Int32Ty, 1),
-                                   "tmp", TheCall);
-    Value *Cond = new ICmpInst(ICmpInst::ICMP_EQ, V, 
-                               Constant::getNullValue(V->getType()), "tmp", 
-                               TheCall);
-    V2 = SelectInst::Create(Cond, ConstantInt::get(Type::Int32Ty, 0), V2,
-                            TheCall->getName(), TheCall);
-    return ReplaceCallWith(TheCall, V2);
-  }
-} FFSOptimizer;
-
-/// This LibCallOptimization will simplify calls to the "ffsl" library
-/// calls. It simply uses FFSOptimization for which the transformation is
-/// identical.
-/// @brief Simplify the ffsl library function.
-struct VISIBILITY_HIDDEN FFSLOptimization : public FFSOptimization {
-public:
-  /// @brief Default Constructor
-  FFSLOptimization() : FFSOptimization("ffsl",
-      "Number of 'ffsl' calls simplified") {}
-
-} FFSLOptimizer;
-
-/// This LibCallOptimization will simplify calls to the "ffsll" library
-/// calls. It simply uses FFSOptimization for which the transformation is
-/// identical.
-/// @brief Simplify the ffsl library function.
-struct VISIBILITY_HIDDEN FFSLLOptimization : public FFSOptimization {
-public:
-  /// @brief Default Constructor
-  FFSLLOptimization() : FFSOptimization("ffsll",
-      "Number of 'ffsll' calls simplified") {}
-
-} FFSLLOptimizer;
-
-/// This optimizes unary functions that take and return doubles.
-struct UnaryDoubleFPOptimizer : public LibCallOptimization {
-  UnaryDoubleFPOptimizer(const char *Fn, const char *Desc)
-  : LibCallOptimization(Fn, Desc) {}
-  
-  // Make sure that this function has the right prototype
-  virtual bool ValidateCalledFunction(const Function *F, SimplifyLibCalls &SLC){
-    return F->arg_size() == 1 && F->arg_begin()->getType() == Type::DoubleTy &&
-           F->getReturnType() == Type::DoubleTy;
-  }
-
-  /// ShrinkFunctionToFloatVersion - If the input to this function is really a
-  /// float, strength reduce this to a float version of the function,
-  /// e.g. floor((double)FLT) -> (double)floorf(FLT).  This can only be called
-  /// when the target supports the destination function and where there can be
-  /// no precision loss.
-  static bool ShrinkFunctionToFloatVersion(CallInst *CI, SimplifyLibCalls &SLC,
-                                           Constant *(SimplifyLibCalls::*FP)()){
-    if (FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getOperand(1)))
-      if (Cast->getOperand(0)->getType() == Type::FloatTy) {
-        Value *New = CallInst::Create((SLC.*FP)(), Cast->getOperand(0),
-                                      CI->getName(), CI);
-        New = new FPExtInst(New, Type::DoubleTy, CI->getName(), CI);
-        CI->replaceAllUsesWith(New);
-        CI->eraseFromParent();
-        if (Cast->use_empty())
-          Cast->eraseFromParent();
-        return true;
-      }
-    return false;
-  }
-};
-
-
-struct VISIBILITY_HIDDEN FloorOptimization : public UnaryDoubleFPOptimizer {
-  FloorOptimization()
-    : UnaryDoubleFPOptimizer("floor", "Number of 'floor' calls simplified") {}
-  
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-#ifdef HAVE_FLOORF
-    // If this is a float argument passed in, convert to floorf.
-    if (ShrinkFunctionToFloatVersion(CI, SLC, &SimplifyLibCalls::get_floorf))
-      return true;
-#endif
-    return false; // opt failed
-  }
-} FloorOptimizer;
-
-struct VISIBILITY_HIDDEN CeilOptimization : public UnaryDoubleFPOptimizer {
-  CeilOptimization()
-  : UnaryDoubleFPOptimizer("ceil", "Number of 'ceil' calls simplified") {}
-  
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-#ifdef HAVE_CEILF
-    // If this is a float argument passed in, convert to ceilf.
-    if (ShrinkFunctionToFloatVersion(CI, SLC, &SimplifyLibCalls::get_ceilf))
-      return true;
-#endif
-    return false; // opt failed
-  }
-} CeilOptimizer;
-
-struct VISIBILITY_HIDDEN RoundOptimization : public UnaryDoubleFPOptimizer {
-  RoundOptimization()
-  : UnaryDoubleFPOptimizer("round", "Number of 'round' calls simplified") {}
-  
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-#ifdef HAVE_ROUNDF
-    // If this is a float argument passed in, convert to roundf.
-    if (ShrinkFunctionToFloatVersion(CI, SLC, &SimplifyLibCalls::get_roundf))
-      return true;
-#endif
-    return false; // opt failed
-  }
-} RoundOptimizer;
-
-struct VISIBILITY_HIDDEN RintOptimization : public UnaryDoubleFPOptimizer {
-  RintOptimization()
-  : UnaryDoubleFPOptimizer("rint", "Number of 'rint' calls simplified") {}
-  
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-#ifdef HAVE_RINTF
-    // If this is a float argument passed in, convert to rintf.
-    if (ShrinkFunctionToFloatVersion(CI, SLC, &SimplifyLibCalls::get_rintf))
-      return true;
-#endif
-    return false; // opt failed
-  }
-} RintOptimizer;
-
-struct VISIBILITY_HIDDEN NearByIntOptimization : public UnaryDoubleFPOptimizer {
-  NearByIntOptimization()
-  : UnaryDoubleFPOptimizer("nearbyint",
-                           "Number of 'nearbyint' calls simplified") {}
-  
-  virtual bool OptimizeCall(CallInst *CI, SimplifyLibCalls &SLC) {
-#ifdef HAVE_NEARBYINTF
-    // If this is a float argument passed in, convert to nearbyintf.
-    if (ShrinkFunctionToFloatVersion(CI, SLC,&SimplifyLibCalls::get_nearbyintf))
-      return true;
-#endif
-    return false; // opt failed
-  }
-} NearByIntOptimizer;
-
-/// GetConstantStringInfo - This function computes the length of a
-/// null-terminated constant array of integers.  This function can't rely on the
-/// size of the constant array because there could be a null terminator in the
-/// middle of the array.
-///
-/// We also have to bail out if we find a non-integer constant initializer
-/// of one of the elements or if there is no null-terminator. The logic
-/// below checks each of these conditions and will return true only if all
-/// conditions are met.  If the conditions aren't met, this returns false.
-///
-/// If successful, the \p Array param is set to the constant array being
-/// indexed, the \p Length parameter is set to the length of the null-terminated
-/// string pointed to by V, the \p StartIdx value is set to the first
-/// element of the Array that V points to, and true is returned.
-static bool GetConstantStringInfo(Value *V, std::string &Str) {
-  // Look through noop bitcast instructions.
-  if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
-    if (BCI->getType() == BCI->getOperand(0)->getType())
-      return GetConstantStringInfo(BCI->getOperand(0), Str);
-    return false;
-  }
-  
-  // If the value is not a GEP instruction nor a constant expression with a
-  // GEP instruction, then return false because ConstantArray can't occur
-  // any other way
-  User *GEP = 0;
-  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(V)) {
-    GEP = GEPI;
-  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
-    if (CE->getOpcode() != Instruction::GetElementPtr)
-      return false;
-    GEP = CE;
-  } else {
-    return false;
-  }
-
-  // Make sure the GEP has exactly three arguments.
-  if (GEP->getNumOperands() != 3)
-    return false;
-
-  // Check to make sure that the first operand of the GEP is an integer and
-  // has value 0 so that we are sure we're indexing into the initializer.
-  if (ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(1))) {
-    if (!Idx->isZero())
-      return false;
-  } else
-    return false;
-
-  // If the second index isn't a ConstantInt, then this is a variable index
-  // into the array.  If this occurs, we can't say anything meaningful about
-  // the string.
-  uint64_t StartIdx = 0;
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(2)))
-    StartIdx = CI->getZExtValue();
-  else
-    return false;
-
-  // The GEP instruction, constant or instruction, must reference a global
-  // variable that is a constant and is initialized. The referenced constant
-  // initializer is the array that we'll use for optimization.
-  GlobalVariable* GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
-  if (!GV || !GV->isConstant() || !GV->hasInitializer())
-    return false;
-  Constant *GlobalInit = GV->getInitializer();
-
-  // Handle the ConstantAggregateZero case
-  if (isa<ConstantAggregateZero>(GlobalInit)) {
-    // This is a degenerate case. The initializer is constant zero so the
-    // length of the string must be zero.
-    Str.clear();
-    return true;
-  }
-
-  // Must be a Constant Array
-  ConstantArray *Array = dyn_cast<ConstantArray>(GlobalInit);
-  if (!Array) return false;
-
-  // Get the number of elements in the array
-  uint64_t NumElts = Array->getType()->getNumElements();
-
-  // Traverse the constant array from StartIdx (derived above) which is
-  // the place the GEP refers to in the array.
-  for (unsigned i = StartIdx; i < NumElts; ++i) {
-    Constant *Elt = Array->getOperand(i);
-    ConstantInt *CI = dyn_cast<ConstantInt>(Elt);
-    if (!CI) // This array isn't suitable, non-int initializer.
-      return false;
-    if (CI->isZero())
-      return true; // we found end of string, success!
-    Str += (char)CI->getZExtValue();
-  }
-  
-  return false; // The array isn't null terminated.
-}
-
-/// CastToCStr - Return V if it is an sbyte*, otherwise cast it to sbyte*,
-/// inserting the cast before IP, and return the cast.
-/// @brief Cast a value to a "C" string.
-static Value *CastToCStr(Value *V, Instruction *IP) {
-  assert(isa<PointerType>(V->getType()) && 
-         "Can't cast non-pointer type to C string type");
-  const Type *SBPTy = PointerType::getUnqual(Type::Int8Ty);
-  if (V->getType() != SBPTy)
-    return new BitCastInst(V, SBPTy, V->getName(), IP);
-  return V;
-}
-
-// TODO:
-//   Additional cases that we need to add to this file:
-//
-// cbrt:
-//   * cbrt(expN(X))  -> expN(x/3)
-//   * cbrt(sqrt(x))  -> pow(x,1/6)
-//   * cbrt(sqrt(x))  -> pow(x,1/9)
-//
-// cos, cosf, cosl:
-//   * cos(-x)  -> cos(x)
-//
-// exp, expf, expl:
-//   * exp(log(x))  -> x
-//
-// log, logf, logl:
-//   * log(exp(x))   -> x
-//   * log(x**y)     -> y*log(x)
-//   * log(exp(y))   -> y*log(e)
-//   * log(exp2(y))  -> y*log(2)
-//   * log(exp10(y)) -> y*log(10)
-//   * log(sqrt(x))  -> 0.5*log(x)
-//   * log(pow(x,y)) -> y*log(x)
-//
-// lround, lroundf, lroundl:
-//   * lround(cnst) -> cnst'
-//
-// memcmp:
-//   * memcmp(x,y,l)   -> cnst
-//      (if all arguments are constant and strlen(x) <= l and strlen(y) <= l)
-//
-// memmove:
-//   * memmove(d,s,l,a) -> memcpy(d,s,l,a)
-//       (if s is a global constant array)
-//
-// pow, powf, powl:
-//   * pow(exp(x),y)  -> exp(x*y)
-//   * pow(sqrt(x),y) -> pow(x,y*0.5)
-//   * pow(pow(x,y),z)-> pow(x,y*z)
-//
-// puts:
-//   * puts("") -> putchar("\n")
-//
-// round, roundf, roundl:
-//   * round(cnst) -> cnst'
-//
-// signbit:
-//   * signbit(cnst) -> cnst'
-//   * signbit(nncst) -> 0 (if pstv is a non-negative constant)
-//
-// sqrt, sqrtf, sqrtl:
-//   * sqrt(expN(x))  -> expN(x*0.5)
-//   * sqrt(Nroot(x)) -> pow(x,1/(2*N))
-//   * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
-//
-// stpcpy:
-//   * stpcpy(str, "literal") ->
-//           llvm.memcpy(str,"literal",strlen("literal")+1,1)
-// strrchr:
-//   * strrchr(s,c) -> reverse_offset_of_in(c,s)
-//      (if c is a constant integer and s is a constant string)
-//   * strrchr(s1,0) -> strchr(s1,0)
-//
-// strncat:
-//   * strncat(x,y,0) -> x
-//   * strncat(x,y,0) -> x (if strlen(y) = 0)
-//   * strncat(x,y,l) -> strcat(x,y) (if y and l are constants an l > strlen(y))
-//
-// strncpy:
-//   * strncpy(d,s,0) -> d
-//   * strncpy(d,s,l) -> memcpy(d,s,l,1)
-//      (if s and l are constants)
-//
-// strpbrk:
-//   * strpbrk(s,a) -> offset_in_for(s,a)
-//      (if s and a are both constant strings)
-//   * strpbrk(s,"") -> 0
-//   * strpbrk(s,a) -> strchr(s,a[0]) (if a is constant string of length 1)
-//
-// strspn, strcspn:
-//   * strspn(s,a)   -> const_int (if both args are constant)
-//   * strspn("",a)  -> 0
-//   * strspn(s,"")  -> 0
-//   * strcspn(s,a)  -> const_int (if both args are constant)
-//   * strcspn("",a) -> 0
-//   * strcspn(s,"") -> strlen(a)
-//
-// strstr:
-//   * strstr(x,x)  -> x
-//   * strstr(s1,s2) -> offset_of_s2_in(s1)
-//       (if s1 and s2 are constant strings)
-//
-// tan, tanf, tanl:
-//   * tan(atan(x)) -> x
-//
-// trunc, truncf, truncl:
-//   * trunc(cnst) -> cnst'
-//
-//
-}

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/StripDeadPrototypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/StripDeadPrototypes.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/StripDeadPrototypes.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/StripDeadPrototypes.cpp Sun Jul  6 15:45:41 2008
@@ -1,4 +1,4 @@
-//===-- StripDeadPrototypes.cpp - Removed unused function declarations ----===//
+//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,7 +8,9 @@
 //===----------------------------------------------------------------------===//
 //
 // This pass loops over all of the functions in the input module, looking for 
-// dead declarations and removes them.
+// dead declarations and removes them. Dead declarations are declarations of
+// functions for which no implementation is available (i.e., declarations for
+// unused library functions).
 //
 //===----------------------------------------------------------------------===//
 
@@ -32,12 +34,12 @@
   virtual bool runOnModule(Module &M);
 };
 
-char StripDeadPrototypesPass::ID = 0;
-RegisterPass<StripDeadPrototypesPass> X("strip-dead-prototypes", 
-                                        "Strip Unused Function Prototypes");
-
 } // end anonymous namespace
 
+char StripDeadPrototypesPass::ID = 0;
+static RegisterPass<StripDeadPrototypesPass>
+X("strip-dead-prototypes", "Strip Unused Function Prototypes");
+
 bool StripDeadPrototypesPass::runOnModule(Module &M) {
   bool MadeChange = false;
   

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/StripSymbols.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/StripSymbols.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/StripSymbols.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/StripSymbols.cpp Sun Jul  6 15:45:41 2008
@@ -46,11 +46,12 @@
       AU.setPreservesAll();
     }
   };
-
-  char StripSymbols::ID = 0;
-  RegisterPass<StripSymbols> X("strip", "Strip all symbols from a module");
 }
 
+char StripSymbols::ID = 0;
+static RegisterPass<StripSymbols>
+X("strip", "Strip all symbols from a module");
+
 ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
   return new StripSymbols(OnlyDebugInfo);
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/IPO/StructRetPromotion.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/IPO/StructRetPromotion.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/IPO/StructRetPromotion.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/IPO/StructRetPromotion.cpp Sun Jul  6 15:45:41 2008
@@ -1,4 +1,4 @@
-//===-- StructRetPromotion.cpp - Promote sret arguments -000000------------===//
+//===-- StructRetPromotion.cpp - Promote sret arguments ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO : Describe this pass.
+// This pass finds functions that return a struct (using a pointer to the struct
+// as the first argument of the function, marked with the 'sret' attribute) and
+// replaces them with a new function that simply returns each of the elements of
+// that struct (using multiple return values).
+//
+// This pass works under a number of conditions:
+//  1. The returned struct must not contain other structs
+//  2. The returned struct must only be used to load values from
+//  3. The placeholder struct passed in is the result of an alloca
+//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "sretpromotion"
@@ -49,12 +58,12 @@
     void updateCallSites(Function *F, Function *NF);
     bool nestedStructType(const StructType *STy);
   };
-
-  char SRETPromotion::ID = 0;
-  RegisterPass<SRETPromotion> X("sretpromotion",
-                               "Promote sret arguments to multiple ret values");
 }
 
+char SRETPromotion::ID = 0;
+static RegisterPass<SRETPromotion>
+X("sretpromotion", "Promote sret arguments to multiple ret values");
+
 Pass *llvm::createStructRetPromotionPass() {
   return new SRETPromotion();
 }
@@ -73,7 +82,7 @@
 bool SRETPromotion::PromoteReturn(CallGraphNode *CGN) {
   Function *F = CGN->getFunction();
 
-  if (!F || F->isDeclaration())
+  if (!F || F->isDeclaration() || !F->hasInternalLinkage())
     return false;
 
   // Make sure that function returns struct.
@@ -83,7 +92,7 @@
   assert (F->getReturnType() == Type::VoidTy && "Invalid function return type");
   Function::arg_iterator AI = F->arg_begin();
   const llvm::PointerType *FArgType = dyn_cast<PointerType>(AI->getType());
-  assert (FArgType && "Invalid sret paramater type");
+  assert (FArgType && "Invalid sret parameter type");
   const llvm::StructType *STy = 
     dyn_cast<StructType>(FArgType->getElementType());
   assert (STy && "Invalid sret parameter element type");
@@ -140,7 +149,7 @@
   return true;
 }
 
-  // Check if it is ok to perform this promotion.
+// Check if it is ok to perform this promotion.
 bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) {
 
   if (F->use_empty())
@@ -149,9 +158,17 @@
 
   for (Value::use_iterator FnUseI = F->use_begin(), FnUseE = F->use_end();
        FnUseI != FnUseE; ++FnUseI) {
+    // The function is passed in as an argument to (possibly) another function,
+    // we can't change it!
+    if (FnUseI.getOperandNo() != 0)
+      return false;
 
     CallSite CS = CallSite::get(*FnUseI);
     Instruction *Call = CS.getInstruction();
+    // The function is used by something else than a call or invoke instruction,
+    // we can't change it!
+    if (!Call)
+      return false;
     CallSite::arg_iterator AI = CS.arg_begin();
     Value *FirstArg = *AI;
 
@@ -223,7 +240,7 @@
 
   FunctionType *NFTy = FunctionType::get(STy, Params, FTy->isVarArg());
   Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
-  NF->setCallingConv(F->getCallingConv());
+  NF->copyAttributesFrom(F);
   NF->setParamAttrs(PAListPtr::get(ParamAttrsVec.begin(), ParamAttrsVec.end()));
   F->getParent()->getFunctionList().insert(F, NF);
   NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
@@ -330,7 +347,7 @@
   unsigned Num = STy->getNumElements();
   for (unsigned i = 0; i < Num; i++) {
     const Type *Ty = STy->getElementType(i);
-    if (!Ty->isFirstClassType() && Ty != Type::VoidTy)
+    if (!Ty->isSingleValueType() && Ty != Type::VoidTy)
       return true;
   }
   return false;

Modified: llvm/branches/non-call-eh/lib/Transforms/Instrumentation/BlockProfiling.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Instrumentation/BlockProfiling.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Instrumentation/BlockProfiling.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Instrumentation/BlockProfiling.cpp Sun Jul  6 15:45:41 2008
@@ -36,14 +36,14 @@
     static char ID;
     bool runOnModule(Module &M);
   };
+}
 
-  char FunctionProfiler::ID = 0;
-
-  RegisterPass<FunctionProfiler> X("insert-function-profiling",
-                               "Insert instrumentation for function profiling");
-  RegisterAnalysisGroup<RSProfilers> XG(X);
+char FunctionProfiler::ID = 0;
 
-}
+static RegisterPass<FunctionProfiler>
+X("insert-function-profiling",
+  "Insert instrumentation for function profiling");
+static RegisterAnalysisGroup<RSProfilers> XG(X);
 
 ModulePass *llvm::createFunctionProfilerPass() {
   return new FunctionProfiler();
@@ -86,13 +86,13 @@
   public:
     static char ID;
   };
-
-  char BlockProfiler::ID = 0;
-  RegisterPass<BlockProfiler> Y("insert-block-profiling",
-                                "Insert instrumentation for block profiling");
-  RegisterAnalysisGroup<RSProfilers> YG(Y);
 }
 
+char BlockProfiler::ID = 0;
+static RegisterPass<BlockProfiler>
+Y("insert-block-profiling", "Insert instrumentation for block profiling");
+static RegisterAnalysisGroup<RSProfilers> YG(Y);
+
 ModulePass *llvm::createBlockProfilerPass() { return new BlockProfiler(); }
 
 bool BlockProfiler::runOnModule(Module &M) {

Modified: llvm/branches/non-call-eh/lib/Transforms/Instrumentation/EdgeProfiling.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Instrumentation/EdgeProfiling.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Instrumentation/EdgeProfiling.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Instrumentation/EdgeProfiling.cpp Sun Jul  6 15:45:41 2008
@@ -36,12 +36,12 @@
     static char ID; // Pass identification, replacement for typeid
     EdgeProfiler() : ModulePass((intptr_t)&ID) {}
   };
-
-  char EdgeProfiler::ID = 0;
-  RegisterPass<EdgeProfiler> X("insert-edge-profiling",
-                               "Insert instrumentation for edge profiling");
 }
 
+char EdgeProfiler::ID = 0;
+static RegisterPass<EdgeProfiler>
+X("insert-edge-profiling", "Insert instrumentation for edge profiling");
+
 ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); }
 
 bool EdgeProfiler::runOnModule(Module &M) {

Modified: llvm/branches/non-call-eh/lib/Transforms/Instrumentation/ProfilingUtils.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Instrumentation/ProfilingUtils.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Instrumentation/ProfilingUtils.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Instrumentation/ProfilingUtils.cpp Sun Jul  6 15:45:41 2008
@@ -68,7 +68,7 @@
       Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy, 
                                                             false);
       InitCall->setOperand(2, 
-          CastInst::create(opcode, AI, ArgVTy, "argv.cast", InitCall));
+          CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall));
     } else {
       InitCall->setOperand(2, AI);
     }
@@ -83,11 +83,11 @@
       if (!AI->use_empty()) {
         opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true);
         AI->replaceAllUsesWith(
-          CastInst::create(opcode, InitCall, AI->getType(), "", InsertPos));
+          CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos));
       }
       opcode = CastInst::getCastOpcode(AI, true, Type::Int32Ty, true);
       InitCall->setOperand(1, 
-          CastInst::create(opcode, AI, Type::Int32Ty, "argc.cast", InitCall));
+          CastInst::Create(opcode, AI, Type::Int32Ty, "argc.cast", InitCall));
     } else {
       AI->replaceAllUsesWith(InitCall);
       InitCall->setOperand(1, AI);
@@ -100,8 +100,8 @@
 void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
                                    GlobalValue *CounterArray) {
   // Insert the increment after any alloca or PHI instructions...
-  BasicBlock::iterator InsertPos = BB->begin();
-  while (isa<AllocaInst>(InsertPos) || isa<PHINode>(InsertPos))
+  BasicBlock::iterator InsertPos = BB->getFirstNonPHI();
+  while (isa<AllocaInst>(InsertPos))
     ++InsertPos;
 
   // Create the getelementptr constant expression
@@ -113,7 +113,7 @@
 
   // Load, increment and store the value back.
   Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
-  Value *NewVal = BinaryOperator::create(Instruction::Add, OldVal,
+  Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal,
                                          ConstantInt::get(Type::Int32Ty, 1),
                                          "NewFuncCounter", InsertPos);
   new StoreInst(NewVal, ElementPtr, InsertPos);

Modified: llvm/branches/non-call-eh/lib/Transforms/Instrumentation/RSProfiling.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Instrumentation/RSProfiling.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Instrumentation/RSProfiling.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Instrumentation/RSProfiling.cpp Sun Jul  6 15:45:41 2008
@@ -55,16 +55,18 @@
   enum RandomMeth {
     GBV, GBVO, HOSTCC
   };
+}
 
-  cl::opt<RandomMeth> RandomMethod("profile-randomness",
-      cl::desc("How to randomly choose to profile:"),
-      cl::values(
-                 clEnumValN(GBV, "global", "global counter"),
-                 clEnumValN(GBVO, "ra_global", 
-                            "register allocated global counter"),
-                 clEnumValN(HOSTCC, "rdcc", "cycle counter"),
-                 clEnumValEnd));
+static cl::opt<RandomMeth> RandomMethod("profile-randomness",
+    cl::desc("How to randomly choose to profile:"),
+    cl::values(
+               clEnumValN(GBV, "global", "global counter"),
+               clEnumValN(GBVO, "ra_global", 
+                          "register allocated global counter"),
+               clEnumValN(HOSTCC, "rdcc", "cycle counter"),
+               clEnumValEnd));
   
+namespace {
   /// NullProfilerRS - The basic profiler that does nothing.  It is the default
   /// profiler and thus terminates RSProfiler chains.  It is useful for 
   /// measuring framework overhead
@@ -81,12 +83,14 @@
       AU.setPreservesAll();
     }
   };
+}
 
-  static RegisterAnalysisGroup<RSProfilers> A("Profiling passes");
-  static RegisterPass<NullProfilerRS> NP("insert-null-profiling-rs",
-                                         "Measure profiling framework overhead");
-  static RegisterAnalysisGroup<RSProfilers, true> NPT(NP);
+static RegisterAnalysisGroup<RSProfilers> A("Profiling passes");
+static RegisterPass<NullProfilerRS> NP("insert-null-profiling-rs",
+                                       "Measure profiling framework overhead");
+static RegisterAnalysisGroup<RSProfilers, true> NPT(NP);
 
+namespace {
   /// Chooser - Something that chooses when to make a sample of the profiled code
   class VISIBILITY_HIDDEN Chooser {
   public:
@@ -158,11 +162,12 @@
     bool doInitialization(Module &M);
     virtual void getAnalysisUsage(AnalysisUsage &AU) const;
   };
-
-  RegisterPass<ProfilerRS> X("insert-rs-profiling-framework",
-                             "Insert random sampling instrumentation framework");
 }
 
+static RegisterPass<ProfilerRS>
+X("insert-rs-profiling-framework",
+  "Insert random sampling instrumentation framework");
+
 char RSProfilers::ID = 0;
 char NullProfilerRS::ID = 0;
 char ProfilerRS::ID = 0;
@@ -210,7 +215,7 @@
   ICmpInst* s = new ICmpInst(ICmpInst::ICMP_EQ, l, ConstantInt::get(T, 0), 
                              "countercc", t);
 
-  Value* nv = BinaryOperator::createSub(l, ConstantInt::get(T, 1),
+  Value* nv = BinaryOperator::CreateSub(l, ConstantInt::get(T, 1),
                                         "counternew", t);
   new StoreInst(nv, Counter, t);
   t->setCondition(s);
@@ -260,14 +265,11 @@
         new StoreInst(l, Counter, bib);
         
         BasicBlock* bb = cast<InvokeInst>(bib)->getNormalDest();
-        BasicBlock::iterator i = bb->begin();
-        while (isa<PHINode>(i))
-          ++i;
+        BasicBlock::iterator i = bb->getFirstNonPHI();
         l = new LoadInst(Counter, "counter", i);
         
         bb = cast<InvokeInst>(bib)->getUnwindDest();
-        i = bb->begin();
-        while (isa<PHINode>(i)) ++i;
+        i = bb->getFirstNonPHI();
         l = new LoadInst(Counter, "counter", i);
         new StoreInst(l, AI, i);
       } else if (isa<UnwindInst>(&*bib) || isa<ReturnInst>(&*bib)) {
@@ -285,7 +287,7 @@
   ICmpInst* s = new ICmpInst(ICmpInst::ICMP_EQ, l, ConstantInt::get(T, 0), 
                              "countercc", t);
 
-  Value* nv = BinaryOperator::createSub(l, ConstantInt::get(T, 1),
+  Value* nv = BinaryOperator::CreateSub(l, ConstantInt::get(T, 1),
                                         "counternew", t);
   new StoreInst(nv, AI, t);
   t->setCondition(s);
@@ -314,7 +316,7 @@
   
   CallInst* c = CallInst::Create(F, "rdcc", t);
   BinaryOperator* b = 
-    BinaryOperator::createAnd(c, ConstantInt::get(Type::Int64Ty, rm),
+    BinaryOperator::CreateAnd(c, ConstantInt::get(Type::Int64Ty, rm),
                               "mrdcc", t);
   
   ICmpInst *s = new ICmpInst(ICmpInst::ICMP_EQ, b,
@@ -338,8 +340,8 @@
 void RSProfilers_std::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
                                           GlobalValue *CounterArray) {
   // Insert the increment after any alloca or PHI instructions...
-  BasicBlock::iterator InsertPos = BB->begin();
-  while (isa<AllocaInst>(InsertPos) || isa<PHINode>(InsertPos))
+  BasicBlock::iterator InsertPos = BB->getFirstNonPHI();
+  while (isa<AllocaInst>(InsertPos))
     ++InsertPos;
   
   // Create the getelementptr constant expression
@@ -352,7 +354,7 @@
   // Load, increment and store the value back.
   Value *OldVal = new LoadInst(ElementPtr, "OldCounter", InsertPos);
   profcode.insert(OldVal);
-  Value *NewVal = BinaryOperator::createAdd(OldVal,
+  Value *NewVal = BinaryOperator::CreateAdd(OldVal,
                                             ConstantInt::get(Type::Int32Ty, 1),
                                             "NewCounter", InsertPos);
   profcode.insert(NewVal);
@@ -376,8 +378,8 @@
     if (bb == &bb->getParent()->getEntryBlock())
       TransCache[bb] = bb; //don't translate entry block
     else
-      TransCache[bb] = BasicBlock::Create("dup_" + bb->getName(), bb->getParent(), 
-                                          NULL);
+      TransCache[bb] = BasicBlock::Create("dup_" + bb->getName(),
+                                          bb->getParent(), NULL);
     return TransCache[bb];
   } else if (Instruction* i = dyn_cast<Instruction>(v)) {
     //we have already translated this

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/ADCE.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/ADCE.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/ADCE.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/ADCE.cpp Sun Jul  6 15:45:41 2008
@@ -1,4 +1,4 @@
-//===- ADCE.cpp - Code to perform aggressive dead code elimination --------===//
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,464 +7,91 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements "aggressive" dead code elimination.  ADCE is DCe where
-// values are assumed to be dead until proven otherwise.  This is similar to
-// SCCP, except applied to the liveness of values.
+// This file implements the Aggressive Dead Code Elimination pass.  This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not 
+// catch, particularly involving loop computations.
 //
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "adce"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
+#include "llvm/BasicBlock.h"
 #include "llvm/Instructions.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CFG.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Compiler.h"
-#include <algorithm>
-using namespace llvm;
-
-STATISTIC(NumBlockRemoved, "Number of basic blocks removed");
-STATISTIC(NumInstRemoved , "Number of instructions removed");
-STATISTIC(NumCallRemoved , "Number of calls removed");
-
-namespace {
-//===----------------------------------------------------------------------===//
-// ADCE Class
-//
-// This class does all of the work of Aggressive Dead Code Elimination.
-// It's public interface consists of a constructor and a doADCE() method.
-//
-class VISIBILITY_HIDDEN ADCE : public FunctionPass {
-  Function *Func;                       // The function that we are working on
-  std::vector<Instruction*> WorkList;   // Instructions that just became live
-  std::set<Instruction*>    LiveSet;    // The set of live instructions
-
-  //===--------------------------------------------------------------------===//
-  // The public interface for this class
-  //
-public:
-  static char ID; // Pass identification, replacement for typeid
-  ADCE() : FunctionPass((intptr_t)&ID) {}
 
-  // Execute the Aggressive Dead Code Elimination Algorithm
-  //
-  virtual bool runOnFunction(Function &F) {
-    Func = &F;
-    bool Changed = doADCE();
-    assert(WorkList.empty());
-    LiveSet.clear();
-    return Changed;
-  }
-  // getAnalysisUsage - We require post dominance frontiers (aka Control
-  // Dependence Graph)
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    // We require that all function nodes are unified, because otherwise code
-    // can be marked live that wouldn't necessarily be otherwise.
-    AU.addRequired<UnifyFunctionExitNodes>();
-    AU.addRequired<AliasAnalysis>();
-    AU.addRequired<PostDominatorTree>();
-    AU.addRequired<PostDominanceFrontier>();
-  }
-
-
-  //===--------------------------------------------------------------------===//
-  // The implementation of this class
-  //
-private:
-  // doADCE() - Run the Aggressive Dead Code Elimination algorithm, returning
-  // true if the function was modified.
-  //
-  bool doADCE();
-
-  void markBlockAlive(BasicBlock *BB);
-
-
-  // deleteDeadInstructionsInLiveBlock - Loop over all of the instructions in
-  // the specified basic block, deleting ones that are dead according to
-  // LiveSet.
-  bool deleteDeadInstructionsInLiveBlock(BasicBlock *BB);
-
-  TerminatorInst *convertToUnconditionalBranch(TerminatorInst *TI);
-
-  inline void markInstructionLive(Instruction *I) {
-    if (!LiveSet.insert(I).second) return;
-    DOUT << "Insn Live: " << *I;
-    WorkList.push_back(I);
-  }
 
-  inline void markTerminatorLive(const BasicBlock *BB) {
-    DOUT << "Terminator Live: " << *BB->getTerminator();
-    markInstructionLive(const_cast<TerminatorInst*>(BB->getTerminator()));
-  }
-};
-
-  char ADCE::ID = 0;
-  RegisterPass<ADCE> X("adce", "Aggressive Dead Code Elimination");
-} // End of anonymous namespace
-
-FunctionPass *llvm::createAggressiveDCEPass() { return new ADCE(); }
-
-void ADCE::markBlockAlive(BasicBlock *BB) {
-  // Mark the basic block as being newly ALIVE... and mark all branches that
-  // this block is control dependent on as being alive also...
-  //
-  PostDominanceFrontier &CDG = getAnalysis<PostDominanceFrontier>();
-
-  PostDominanceFrontier::const_iterator It = CDG.find(BB);
-  if (It != CDG.end()) {
-    // Get the blocks that this node is control dependent on...
-    const PostDominanceFrontier::DomSetType &CDB = It->second;
-    for (PostDominanceFrontier::DomSetType::const_iterator I =
-           CDB.begin(), E = CDB.end(); I != E; ++I)
-      markTerminatorLive(*I);   // Mark all their terminators as live
-  }
-
-  // If this basic block is live, and it ends in an unconditional branch, then
-  // the branch is alive as well...
-  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
-    if (BI->isUnconditional())
-      markTerminatorLive(BB);
-}
+using namespace llvm;
 
-// deleteDeadInstructionsInLiveBlock - Loop over all of the instructions in the
-// specified basic block, deleting ones that are dead according to LiveSet.
-bool ADCE::deleteDeadInstructionsInLiveBlock(BasicBlock *BB) {
-  bool Changed = false;
-  for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E; ) {
-    Instruction *I = II++;
-    if (!LiveSet.count(I)) {              // Is this instruction alive?
-      if (!I->use_empty())
-        I->replaceAllUsesWith(UndefValue::get(I->getType()));
+STATISTIC(NumRemoved, "Number of instructions removed");
 
-      // Nope... remove the instruction from it's basic block...
-      if (isa<CallInst>(I))
-        ++NumCallRemoved;
-      else
-        ++NumInstRemoved;
-      BB->getInstList().erase(I);
-      Changed = true;
+namespace {
+  struct VISIBILITY_HIDDEN ADCE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ADCE() : FunctionPass((intptr_t)&ID) {}
+    
+    virtual bool runOnFunction(Function& F);
+    
+    virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+      AU.setPreservesCFG();
     }
-  }
-  return Changed;
+    
+  };
 }
 
+char ADCE::ID = 0;
+static RegisterPass<ADCE> X("adce", "Aggressive Dead Code Elimination");
 
-/// convertToUnconditionalBranch - Transform this conditional terminator
-/// instruction into an unconditional branch because we don't care which of the
-/// successors it goes to.  This eliminate a use of the condition as well.
-///
-TerminatorInst *ADCE::convertToUnconditionalBranch(TerminatorInst *TI) {
-  BranchInst *NB = BranchInst::Create(TI->getSuccessor(0), TI);
-  BasicBlock *BB = TI->getParent();
-
-  // Remove entries from PHI nodes to avoid confusing ourself later...
-  for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
-    TI->getSuccessor(i)->removePredecessor(BB);
-
-  // Delete the old branch itself...
-  BB->getInstList().erase(TI);
-  return NB;
-}
-
-
-// doADCE() - Run the Aggressive Dead Code Elimination algorithm, returning
-// true if the function was modified.
-//
-bool ADCE::doADCE() {
-  bool MadeChanges = false;
-
-  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
-
-  // Iterate over all of the instructions in the function, eliminating trivially
-  // dead instructions, and marking instructions live that are known to be
-  // needed.  Perform the walk in depth first order so that we avoid marking any
-  // instructions live in basic blocks that are unreachable.  These blocks will
-  // be eliminated later, along with the instructions inside.
-  //
-  std::set<BasicBlock*> ReachableBBs;
-  for (df_ext_iterator<BasicBlock*>
-         BBI = df_ext_begin(&Func->front(), ReachableBBs),
-         BBE = df_ext_end(&Func->front(), ReachableBBs); BBI != BBE; ++BBI) {
-    BasicBlock *BB = *BBI;
-    for (BasicBlock::iterator II = BB->begin(), EI = BB->end(); II != EI; ) {
-      Instruction *I = II++;
-      if (CallInst *CI = dyn_cast<CallInst>(I)) {
-        if (AA.onlyReadsMemory(CI)) {
-          if (CI->use_empty()) {
-            BB->getInstList().erase(CI);
-            ++NumCallRemoved;
-          }
-        } else {
-          markInstructionLive(I);
-        }
-      } else if (I->mayWriteToMemory() || isa<ReturnInst>(I) ||
-                 isa<UnwindInst>(I) || isa<UnreachableInst>(I)) {
-        // FIXME: Unreachable instructions should not be marked intrinsically
-        // live here.
-        markInstructionLive(I);
-      } else if (isInstructionTriviallyDead(I)) {
-        // Remove the instruction from it's basic block...
-        BB->getInstList().erase(I);
-        ++NumInstRemoved;
-      }
-    }
-  }
-
-  // Check to ensure we have an exit node for this CFG.  If we don't, we won't
-  // have any post-dominance information, thus we cannot perform our
-  // transformations safely.
-  //
-  PostDominatorTree &DT = getAnalysis<PostDominatorTree>();
-  if (DT[&Func->getEntryBlock()] == 0) {
-    WorkList.clear();
-    return MadeChanges;
-  }
-
-  // Scan the function marking blocks without post-dominance information as
-  // live.  Blocks without post-dominance information occur when there is an
-  // infinite loop in the program.  Because the infinite loop could contain a
-  // function which unwinds, exits or has side-effects, we don't want to delete
-  // the infinite loop or those blocks leading up to it.
-  for (Function::iterator I = Func->begin(), E = Func->end(); I != E; ++I)
-    if (DT[I] == 0 && ReachableBBs.count(I))
-      for (pred_iterator PI = pred_begin(I), E = pred_end(I); PI != E; ++PI)
-        markInstructionLive((*PI)->getTerminator());
-
-  DOUT << "Processing work list\n";
-
-  // AliveBlocks - Set of basic blocks that we know have instructions that are
-  // alive in them...
-  //
-  std::set<BasicBlock*> AliveBlocks;
-
-  // Process the work list of instructions that just became live... if they
-  // became live, then that means that all of their operands are necessary as
-  // well... make them live as well.
-  //
-  while (!WorkList.empty()) {
-    Instruction *I = WorkList.back(); // Get an instruction that became live...
-    WorkList.pop_back();
-
-    BasicBlock *BB = I->getParent();
-    if (!ReachableBBs.count(BB)) continue;
-    if (AliveBlocks.insert(BB).second)     // Basic block not alive yet.
-      markBlockAlive(BB);             // Make it so now!
-
-    // PHI nodes are a special case, because the incoming values are actually
-    // defined in the predecessor nodes of this block, meaning that the PHI
-    // makes the predecessors alive.
-    //
-    if (PHINode *PN = dyn_cast<PHINode>(I)) {
-      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
-        // If the incoming edge is clearly dead, it won't have control
-        // dependence information.  Do not mark it live.
-        BasicBlock *PredBB = PN->getIncomingBlock(i);
-        if (ReachableBBs.count(PredBB)) {
-          // FIXME: This should mark the control dependent edge as live, not
-          // necessarily the predecessor itself!
-          if (AliveBlocks.insert(PredBB).second)
-            markBlockAlive(PN->getIncomingBlock(i));   // Block is newly ALIVE!
-          if (Instruction *Op = dyn_cast<Instruction>(PN->getIncomingValue(i)))
-            markInstructionLive(Op);
-        }
-      }
-    } else {
-      // Loop over all of the operands of the live instruction, making sure that
-      // they are known to be alive as well.
-      //
-      for (unsigned op = 0, End = I->getNumOperands(); op != End; ++op)
-        if (Instruction *Operand = dyn_cast<Instruction>(I->getOperand(op)))
-          markInstructionLive(Operand);
-    }
-  }
-
-  DEBUG(
-    DOUT << "Current Function: X = Live\n";
-    for (Function::iterator I = Func->begin(), E = Func->end(); I != E; ++I){
-      DOUT << I->getName() << ":\t"
-           << (AliveBlocks.count(I) ? "LIVE\n" : "DEAD\n");
-      for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE; ++BI){
-        if (LiveSet.count(BI)) DOUT << "X ";
-        DOUT << *BI;
-      }
-    });
-
-  // All blocks being live is a common case, handle it specially.
-  if (AliveBlocks.size() == Func->size()) {  // No dead blocks?
-    for (Function::iterator I = Func->begin(), E = Func->end(); I != E; ++I) {
-      // Loop over all of the instructions in the function deleting instructions
-      // to drop their references.
-      deleteDeadInstructionsInLiveBlock(I);
-
-      // Check to make sure the terminator instruction is live.  If it isn't,
-      // this means that the condition that it branches on (we know it is not an
-      // unconditional branch), is not needed to make the decision of where to
-      // go to, because all outgoing edges go to the same place.  We must remove
-      // the use of the condition (because it's probably dead), so we convert
-      // the terminator to an unconditional branch.
-      //
-      TerminatorInst *TI = I->getTerminator();
-      if (!LiveSet.count(TI))
-        convertToUnconditionalBranch(TI);
-    }
-
-    return MadeChanges;
-  }
-
-
-  // If the entry node is dead, insert a new entry node to eliminate the entry
-  // node as a special case.
-  //
-  if (!AliveBlocks.count(&Func->front())) {
-    BasicBlock *NewEntry = BasicBlock::Create();
-    BranchInst::Create(&Func->front(), NewEntry);
-    Func->getBasicBlockList().push_front(NewEntry);
-    AliveBlocks.insert(NewEntry);    // This block is always alive!
-    LiveSet.insert(NewEntry->getTerminator());  // The branch is live
+bool ADCE::runOnFunction(Function& F) {
+  SmallPtrSet<Instruction*, 128> alive;
+  SmallVector<Instruction*, 128> worklist;
+  
+  // Collect the set of "root" instructions that are known live.
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (isa<TerminatorInst>(I.getInstructionIterator()) ||
+        I->mayWriteToMemory()) {
+      alive.insert(I.getInstructionIterator());
+      worklist.push_back(I.getInstructionIterator());
+    }
+  
+  // Propagate liveness backwards to operands.
+  while (!worklist.empty()) {
+    Instruction* curr = worklist.back();
+    worklist.pop_back();
+    
+    for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
+         OI != OE; ++OI)
+      if (Instruction* Inst = dyn_cast<Instruction>(OI))
+        if (alive.insert(Inst))
+          worklist.push_back(Inst);
+  }
+  
+  // The inverse of the live set is the dead set.  These are those instructions
+  // which have no side effects and do not influence the control flow or return
+  // value of the function, and may therefore be deleted safely.
+  // NOTE: We reuse the worklist vector here for memory efficiency.
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (!alive.count(I.getInstructionIterator())) {
+      worklist.push_back(I.getInstructionIterator());
+      I->dropAllReferences();
+    }
+  
+  for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(),
+       E = worklist.end(); I != E; ++I) {
+    NumRemoved++;
+    (*I)->eraseFromParent();
   }
+    
+  return !worklist.empty();
+}
 
-  // Loop over all of the alive blocks in the function.  If any successor
-  // blocks are not alive, we adjust the outgoing branches to branch to the
-  // first live postdominator of the live block, adjusting any PHI nodes in
-  // the block to reflect this.
-  //
-  for (Function::iterator I = Func->begin(), E = Func->end(); I != E; ++I)
-    if (AliveBlocks.count(I)) {
-      BasicBlock *BB = I;
-      TerminatorInst *TI = BB->getTerminator();
-
-      // If the terminator instruction is alive, but the block it is contained
-      // in IS alive, this means that this terminator is a conditional branch on
-      // a condition that doesn't matter.  Make it an unconditional branch to
-      // ONE of the successors.  This has the side effect of dropping a use of
-      // the conditional value, which may also be dead.
-      if (!LiveSet.count(TI))
-        TI = convertToUnconditionalBranch(TI);
-
-      // Loop over all of the successors, looking for ones that are not alive.
-      // We cannot save the number of successors in the terminator instruction
-      // here because we may remove them if we don't have a postdominator.
-      //
-      for (unsigned i = 0; i != TI->getNumSuccessors(); ++i)
-        if (!AliveBlocks.count(TI->getSuccessor(i))) {
-          // Scan up the postdominator tree, looking for the first
-          // postdominator that is alive, and the last postdominator that is
-          // dead...
-          //
-          DomTreeNode *LastNode = DT[TI->getSuccessor(i)];
-          DomTreeNode *NextNode = 0;
-
-          if (LastNode) {
-            NextNode = LastNode->getIDom();
-            while (!AliveBlocks.count(NextNode->getBlock())) {
-              LastNode = NextNode;
-              NextNode = NextNode->getIDom();
-              if (NextNode == 0) {
-                LastNode = 0;
-                break;
-              }
-            }
-          }
-
-          // There is a special case here... if there IS no post-dominator for
-          // the block we have nowhere to point our branch to.  Instead, convert
-          // it to a return.  This can only happen if the code branched into an
-          // infinite loop.  Note that this may not be desirable, because we
-          // _are_ altering the behavior of the code.  This is a well known
-          // drawback of ADCE, so in the future if we choose to revisit the
-          // decision, this is where it should be.
-          //
-          if (LastNode == 0) {        // No postdominator!
-            if (!isa<InvokeInst>(TI)) {
-              // Call RemoveSuccessor to transmogrify the terminator instruction
-              // to not contain the outgoing branch, or to create a new
-              // terminator if the form fundamentally changes (i.e.,
-              // unconditional branch to return).  Note that this will change a
-              // branch into an infinite loop into a return instruction!
-              //
-              RemoveSuccessor(TI, i);
-
-              // RemoveSuccessor may replace TI... make sure we have a fresh
-              // pointer.
-              //
-              TI = BB->getTerminator();
-
-              // Rescan this successor...
-              --i;
-            } else {
-
-            }
-          } else {
-            // Get the basic blocks that we need...
-            BasicBlock *LastDead = LastNode->getBlock();
-            BasicBlock *NextAlive = NextNode->getBlock();
-
-            // Make the conditional branch now go to the next alive block...
-            TI->getSuccessor(i)->removePredecessor(BB);
-            TI->setSuccessor(i, NextAlive);
-
-            // If there are PHI nodes in NextAlive, we need to add entries to
-            // the PHI nodes for the new incoming edge.  The incoming values
-            // should be identical to the incoming values for LastDead.
-            //
-            for (BasicBlock::iterator II = NextAlive->begin();
-                 isa<PHINode>(II); ++II) {
-              PHINode *PN = cast<PHINode>(II);
-              if (LiveSet.count(PN)) {  // Only modify live phi nodes
-                // Get the incoming value for LastDead...
-                int OldIdx = PN->getBasicBlockIndex(LastDead);
-                assert(OldIdx != -1 &&"LastDead is not a pred of NextAlive!");
-                Value *InVal = PN->getIncomingValue(OldIdx);
-
-                // Add an incoming value for BB now...
-                PN->addIncoming(InVal, BB);
-              }
-            }
-          }
-        }
-
-      // Now loop over all of the instructions in the basic block, deleting
-      // dead instructions.  This is so that the next sweep over the program
-      // can safely delete dead instructions without other dead instructions
-      // still referring to them.
-      //
-      deleteDeadInstructionsInLiveBlock(BB);
-    }
-
-  // Loop over all of the basic blocks in the function, dropping references of
-  // the dead basic blocks.  We must do this after the previous step to avoid
-  // dropping references to PHIs which still have entries...
-  //
-  std::vector<BasicBlock*> DeadBlocks;
-  for (Function::iterator BB = Func->begin(), E = Func->end(); BB != E; ++BB)
-    if (!AliveBlocks.count(BB)) {
-      // Remove PHI node entries for this block in live successor blocks.
-      for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
-        if (!SI->empty() && isa<PHINode>(SI->front()) && AliveBlocks.count(*SI))
-          (*SI)->removePredecessor(BB);
-
-      BB->dropAllReferences();
-      MadeChanges = true;
-      DeadBlocks.push_back(BB);
-    }
-
-  NumBlockRemoved += DeadBlocks.size();
-
-  // Now loop through all of the blocks and delete the dead ones.  We can safely
-  // do this now because we know that there are no references to dead blocks
-  // (because they have dropped all of their references).
-  for (std::vector<BasicBlock*>::iterator I = DeadBlocks.begin(),
-         E = DeadBlocks.end(); I != E; ++I)
-    Func->getBasicBlockList().erase(*I);
-
-  return MadeChanges;
+FunctionPass *llvm::createAggressiveDCEPass() {
+  return new ADCE();
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/BasicBlockPlacement.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/BasicBlockPlacement.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/BasicBlockPlacement.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/BasicBlockPlacement.cpp Sun Jul  6 15:45:41 2008
@@ -72,12 +72,12 @@
     /// successors.
     void PlaceBlocks(BasicBlock *BB);
   };
-
-  char BlockPlacement::ID = 0;
-  RegisterPass<BlockPlacement> X("block-placement",
-                                 "Profile Guided Basic Block Placement");
 }
 
+char BlockPlacement::ID = 0;
+static RegisterPass<BlockPlacement>
+X("block-placement", "Profile Guided Basic Block Placement");
+
 FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); }
 
 bool BlockPlacement::runOnFunction(Function &F) {

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/CodeGenPrepare.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/CodeGenPrepare.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/CodeGenPrepare.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/CodeGenPrepare.cpp Sun Jul  6 15:45:41 2008
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 //
 // This pass munges the code in the input function to better prepare it for
-// SelectionDAG-based code generation.  This works around limitations in it's
-// basic-block-at-a-time approach.  It should eventually be removed.
+// SelectionDAG-based code generation. This works around limitations in it's
+// basic-block-at-a-time approach. It should eventually be removed.
 //
 //===----------------------------------------------------------------------===//
 
@@ -333,16 +333,16 @@
 /// Return true if any changes are made.
 static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
   // If this is a noop copy, 
-  MVT::ValueType SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
-  MVT::ValueType DstVT = TLI.getValueType(CI->getType());
+  MVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
+  MVT DstVT = TLI.getValueType(CI->getType());
   
   // This is an fp<->int conversion?
-  if (MVT::isInteger(SrcVT) != MVT::isInteger(DstVT))
+  if (SrcVT.isInteger() != DstVT.isInteger())
     return false;
-  
+
   // If this is an extension, it will be a zero or sign extension, which
   // isn't a noop.
-  if (SrcVT < DstVT) return false;
+  if (SrcVT.bitsLT(DstVT)) return false;
   
   // If these values will be promoted, find out what they will be promoted
   // to.  This helps us consider truncates on PPC as noop copies when they
@@ -385,11 +385,10 @@
     CastInst *&InsertedCast = InsertedCasts[UserBB];
 
     if (!InsertedCast) {
-      BasicBlock::iterator InsertPt = UserBB->begin();
-      while (isa<PHINode>(InsertPt)) ++InsertPt;
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
       
       InsertedCast = 
-        CastInst::create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", 
+        CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", 
                          InsertPt);
       MadeChange = true;
     }
@@ -443,11 +442,10 @@
     CmpInst *&InsertedCmp = InsertedCmps[UserBB];
 
     if (!InsertedCmp) {
-      BasicBlock::iterator InsertPt = UserBB->begin();
-      while (isa<PHINode>(InsertPt)) ++InsertPt;
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
       
       InsertedCmp = 
-        CmpInst::create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0), 
+        CmpInst::Create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0), 
                         CI->getOperand(1), "", InsertPt);
       MadeChange = true;
     }
@@ -483,6 +481,7 @@
   }
 }
 
+namespace {
 
 /// ExtAddrMode - This is an extended version of TargetLowering::AddrMode which
 /// holds actual Value*'s for register values.
@@ -517,6 +516,8 @@
   cerr << *this << "\n";
 }
 
+}
+
 static bool TryMatchingScaledValue(Value *ScaleReg, int64_t Scale,
                                    const Type *AccessTy, ExtAddrMode &AddrMode,
                                    SmallVector<Instruction*, 16> &AddrModeInsts,
@@ -877,7 +878,7 @@
         V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt);
       }
       if (AddrMode.Scale != 1)
-        V = BinaryOperator::createMul(V, ConstantInt::get(IntPtrTy,
+        V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy,
                                                           AddrMode.Scale),
                                       "sunkaddr", InsertPt);
       Result = V;
@@ -889,7 +890,7 @@
       if (V->getType() != IntPtrTy)
         V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
       if (Result)
-        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
       else
         Result = V;
     }
@@ -899,7 +900,7 @@
       Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr",
                                   InsertPt);
       if (Result)
-        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
       else
         Result = V;
     }
@@ -908,7 +909,7 @@
     if (AddrMode.BaseOffs) {
       Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
       if (Result)
-        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
       else
         Result = V;
     }
@@ -962,7 +963,7 @@
     }
 
     // Compute the constraint code and ConstraintType to use.
-    OpInfo.ComputeConstraintToUse(*TLI);
+    TLI->ComputeConstraintToUse(OpInfo, SDOperand());
 
     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
         OpInfo.isIndirect) {
@@ -1036,8 +1037,7 @@
     Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
 
     if (!InsertedTrunc) {
-      BasicBlock::iterator InsertPt = UserBB->begin();
-      while (isa<PHINode>(InsertPt)) ++InsertPt;
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
       
       InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt);
     }
@@ -1127,15 +1127,6 @@
             // Sink address computing for memory operands into the block.
             MadeChange |= OptimizeInlineAsmInst(I, &(*CI), SunkAddrs);
         }
-    } else if (GetResultInst *GRI = dyn_cast<GetResultInst>(I)) {
-      // Ensure that all getresult instructions live in the same basic block
-      // as their associated struct-value instructions. Codegen requires
-      // this, as lowering only works on one basic block at a time.
-      if (Instruction *Agg = dyn_cast<Instruction>(GRI->getAggregateValue())) {
-        BasicBlock *AggBB = Agg->getParent();
-        if (AggBB != GRI->getParent())
-          GRI->moveBefore(AggBB->getTerminator());
-      }
     }
   }
     

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/CondPropagate.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/CondPropagate.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/CondPropagate.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/CondPropagate.cpp Sun Jul  6 15:45:41 2008
@@ -48,10 +48,10 @@
     void SimplifyPredecessors(SwitchInst *SI);
     void RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB);
   };
-  
-  char CondProp::ID = 0;
-  RegisterPass<CondProp> X("condprop", "Conditional Propagation");
 }
+  
+char CondProp::ID = 0;
+static RegisterPass<CondProp> X("condprop", "Conditional Propagation");
 
 FunctionPass *llvm::createCondPropagationPass() {
   return new CondProp();

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/ConstantProp.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/ConstantProp.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/ConstantProp.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/ConstantProp.cpp Sun Jul  6 15:45:41 2008
@@ -43,12 +43,12 @@
       AU.setPreservesCFG();
     }
   };
-
-  char ConstantPropagation::ID = 0;
-  RegisterPass<ConstantPropagation> X("constprop",
-                                      "Simple constant propagation");
 }
 
+char ConstantPropagation::ID = 0;
+static RegisterPass<ConstantPropagation>
+X("constprop", "Simple constant propagation");
+
 FunctionPass *llvm::createConstantPropagationPass() {
   return new ConstantPropagation();
 }
@@ -79,7 +79,7 @@
 
         // Remove the dead instruction.
         WorkList.erase(I);
-        I->getParent()->getInstList().erase(I);
+        I->eraseFromParent();
 
         // We made a change to the function...
         Changed = true;

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/DCE.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/DCE.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/DCE.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/DCE.cpp Sun Jul  6 15:45:41 2008
@@ -52,11 +52,12 @@
       AU.setPreservesCFG();
     }
   };
-
-  char DeadInstElimination::ID = 0;
-  RegisterPass<DeadInstElimination> X("die", "Dead Instruction Elimination");
 }
 
+char DeadInstElimination::ID = 0;
+static RegisterPass<DeadInstElimination>
+X("die", "Dead Instruction Elimination");
+
 Pass *llvm::createDeadInstEliminationPass() {
   return new DeadInstElimination();
 }
@@ -76,11 +77,11 @@
       AU.setPreservesCFG();
     }
  };
-
-  char DCE::ID = 0;
-  RegisterPass<DCE> Y("dce", "Dead Code Elimination");
 }
 
+char DCE::ID = 0;
+static RegisterPass<DCE> Y("dce", "Dead Code Elimination");
+
 bool DCE::runOnFunction(Function &F) {
   // Start out with all of the instructions in the worklist...
   std::vector<Instruction*> WorkList;

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/DeadStoreElimination.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/DeadStoreElimination.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/DeadStoreElimination.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/DeadStoreElimination.cpp Sun Jul  6 15:45:41 2008
@@ -92,10 +92,11 @@
       AU.addPreserved<MemoryDependenceAnalysis>();
     }
   };
-  char DSE::ID = 0;
-  RegisterPass<DSE> X("dse", "Dead Store Elimination");
 }
 
+char DSE::ID = 0;
+static RegisterPass<DSE> X("dse", "Dead Store Elimination");
+
 FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
 
 bool DSE::runOnBasicBlock(BasicBlock &BB) {
@@ -326,9 +327,9 @@
     
     // If we encounter a use of the pointer, it is no longer considered dead
     if (LoadInst* L = dyn_cast<LoadInst>(BBI)) {
-      // However, if this load is unused, we can go ahead and remove it, and
-      // not have to worry about it making our pointer undead!
-      if (L->use_empty()) {
+      // However, if this load is unused and not volatile, we can go ahead and
+      // remove it, and not have to worry about it making our pointer undead!
+      if (L->use_empty() && !L->isVolatile()) {
         MD.removeInstruction(L);
         
         // DCE instructions only used to calculate that load

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/GCSE.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/GCSE.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/GCSE.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/GCSE.cpp Sun Jul  6 15:45:41 2008
@@ -9,9 +9,12 @@
 //
 // This pass is designed to be a very quick global transformation that
 // eliminates global common subexpressions from a function.  It does this by
-// using an existing value numbering implementation to identify the common
+// using an existing value numbering analysis pass to identify the common
 // subexpressions, eliminating them when possible.
 //
+// This pass is deprecated by the Global Value Numbering pass (which does a
+// better job with its own value numbering).
+//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "gcse"
@@ -52,11 +55,12 @@
       AU.addRequired<ValueNumbering>();
     }
   };
-
-  char GCSE::ID = 0;
-  RegisterPass<GCSE> X("gcse", "Global Common Subexpression Elimination");
 }
 
+char GCSE::ID = 0;
+static RegisterPass<GCSE>
+X("gcse", "Global Common Subexpression Elimination");
+
 // createGCSEPass - The public interface to this file...
 FunctionPass *llvm::createGCSEPass() { return new GCSE(); }
 
@@ -197,5 +201,5 @@
   }
 
   // Erase the instruction from the program.
-  I->getParent()->getInstList().erase(I);
+  I->eraseFromParent();
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/GVN.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/GVN.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/GVN.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/GVN.cpp Sun Jul  6 15:45:41 2008
@@ -10,6 +10,9 @@
 // This pass performs global value numbering to eliminate fully redundant
 // instructions.  It also performs simple dead load elimination.
 //
+// Note that this pass does the value numbering itself, it does not use the
+// ValueNumbering analysis passes.
+//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "gvn"
@@ -18,15 +21,12 @@
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
-#include "llvm/IntrinsicInst.h"
 #include "llvm/Instructions.h"
-#include "llvm/ParameterAttributes.h"
 #include "llvm/Value.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -35,13 +35,15 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Target/TargetData.h"
-#include <list>
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 using namespace llvm;
 
 STATISTIC(NumGVNInstr, "Number of instructions deleted");
 STATISTIC(NumGVNLoad, "Number of loads deleted");
+STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
+
+static cl::opt<bool> EnablePRE("enable-pre",
+                               cl::init(false), cl::Hidden);
 
 //===----------------------------------------------------------------------===//
 //                         ValueTable Class
@@ -61,8 +63,8 @@
                             FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT,
                             SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI,
                             FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT, 
-                            PTRTOINT, INTTOPTR, BITCAST, GEP, CALL, EMPTY,
-                            TOMBSTONE };
+                            PTRTOINT, INTTOPTR, BITCAST, GEP, CALL, CONSTANT,
+                            EMPTY, TOMBSTONE };
 
     ExpressionOpcode opcode;
     const Type* type;
@@ -136,6 +138,7 @@
       DenseMap<Expression, uint32_t> expressionNumbering;
       AliasAnalysis* AA;
       MemoryDependenceAnalysis* MD;
+      DominatorTree* DT;
   
       uint32_t nextValueNumber;
     
@@ -151,6 +154,7 @@
       Expression create_expression(CastInst* C);
       Expression create_expression(GetElementPtrInst* G);
       Expression create_expression(CallInst* C);
+      Expression create_expression(Constant* C);
     public:
       ValueTable() : nextValueNumber(1) { }
       uint32_t lookup_or_add(Value* V);
@@ -161,6 +165,8 @@
       unsigned size();
       void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
       void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
+      void setDomTree(DominatorTree* D) { DT = D; }
+      uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
   };
 }
 
@@ -228,7 +234,7 @@
 }
 
 Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) {
-  if (isa<ICmpInst>(C)) {
+  if (isa<ICmpInst>(C) || isa<VICmpInst>(C)) {
     switch (C->getPredicate()) {
     default:  // THIS SHOULD NEVER HAPPEN
       assert(0 && "Comparison with unknown predicate?");
@@ -244,7 +250,7 @@
     case ICmpInst::ICMP_SLE: return Expression::ICMPSLE;
     }
   }
-  assert(isa<FCmpInst>(C) && "Unknown compare");
+  assert((isa<FCmpInst>(C) || isa<VFCmpInst>(C)) && "Unknown compare");
   switch (C->getPredicate()) {
   default: // THIS SHOULD NEVER HAPPEN
     assert(0 && "Comparison with unknown predicate?");
@@ -394,7 +400,7 @@
 
 Expression ValueTable::create_expression(GetElementPtrInst* G) {
   Expression e;
-    
+  
   e.firstVN = lookup_or_add(G->getPointerOperand());
   e.secondVN = 0;
   e.thirdVN = 0;
@@ -413,6 +419,11 @@
 //                     ValueTable External Functions
 //===----------------------------------------------------------------------===//
 
+/// add - Insert a value into the table with a specified value number.
+void ValueTable::add(Value* V, uint32_t num) {
+  valueNumbering.insert(std::make_pair(V, num));
+}
+
 /// lookup_or_add - Returns the value number for the specified value, assigning
 /// it a new number if it did not have one before.
 uint32_t ValueTable::lookup_or_add(Value* V) {
@@ -437,26 +448,97 @@
     } else if (AA->onlyReadsMemory(C)) {
       Expression e = create_expression(C);
       
-      Instruction* dep = MD->getDependency(C);
-      
-      if (dep == MemoryDependenceAnalysis::NonLocal ||
-          !isa<CallInst>(dep)) {
+      if (expressionNumbering.find(e) == expressionNumbering.end()) {
         expressionNumbering.insert(std::make_pair(e, nextValueNumber));
         valueNumbering.insert(std::make_pair(V, nextValueNumber));
+        return nextValueNumber++;
+      }
+      
+      Instruction* local_dep = MD->getDependency(C);
       
+      if (local_dep == MemoryDependenceAnalysis::None) {
+        valueNumbering.insert(std::make_pair(V, nextValueNumber));
         return nextValueNumber++;
+      } else if (local_dep != MemoryDependenceAnalysis::NonLocal) {
+        if (!isa<CallInst>(local_dep)) {
+          valueNumbering.insert(std::make_pair(V, nextValueNumber));
+          return nextValueNumber++;
+        }
+        
+        CallInst* local_cdep = cast<CallInst>(local_dep);
+        
+        if (local_cdep->getCalledFunction() != C->getCalledFunction() ||
+            local_cdep->getNumOperands() != C->getNumOperands()) {
+          valueNumbering.insert(std::make_pair(V, nextValueNumber));
+          return nextValueNumber++;
+        } else if (!C->getCalledFunction()) { 
+          valueNumbering.insert(std::make_pair(V, nextValueNumber));
+          return nextValueNumber++;
+        } else {
+          for (unsigned i = 1; i < C->getNumOperands(); ++i) {
+            uint32_t c_vn = lookup_or_add(C->getOperand(i));
+            uint32_t cd_vn = lookup_or_add(local_cdep->getOperand(i));
+            if (c_vn != cd_vn) {
+              valueNumbering.insert(std::make_pair(V, nextValueNumber));
+              return nextValueNumber++;
+            }
+          }
+        
+          uint32_t v = lookup_or_add(local_cdep);
+          valueNumbering.insert(std::make_pair(V, v));
+          return v;
+        }
       }
       
-      CallInst* cdep = cast<CallInst>(dep);
-      Expression d_exp = create_expression(cdep);
       
-      if (e != d_exp) {
-        expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      DenseMap<BasicBlock*, Value*> deps;
+      MD->getNonLocalDependency(C, deps);
+      CallInst* cdep = 0;
+      
+      for (DenseMap<BasicBlock*, Value*>::iterator I = deps.begin(),
+           E = deps.end(); I != E; ++I) {
+        if (I->second == MemoryDependenceAnalysis::None) {
+          valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+          return nextValueNumber++;
+        } else if (I->second != MemoryDependenceAnalysis::NonLocal) {
+          if (DT->properlyDominates(I->first, C->getParent())) {
+            if (CallInst* CD = dyn_cast<CallInst>(I->second))
+              cdep = CD;
+            else {
+              valueNumbering.insert(std::make_pair(V, nextValueNumber));
+              return nextValueNumber++;
+            }
+          } else {
+            valueNumbering.insert(std::make_pair(V, nextValueNumber));
+            return nextValueNumber++;
+          }
+        }
+      }
+      
+      if (!cdep) {
         valueNumbering.insert(std::make_pair(V, nextValueNumber));
+        return nextValueNumber++;
+      }
       
+      if (cdep->getCalledFunction() != C->getCalledFunction() ||
+          cdep->getNumOperands() != C->getNumOperands()) {
+        valueNumbering.insert(std::make_pair(V, nextValueNumber));
+        return nextValueNumber++;
+      } else if (!C->getCalledFunction()) { 
+        valueNumbering.insert(std::make_pair(V, nextValueNumber));
         return nextValueNumber++;
       } else {
-        uint32_t v = expressionNumbering[d_exp];
+        for (unsigned i = 1; i < C->getNumOperands(); ++i) {
+          uint32_t c_vn = lookup_or_add(C->getOperand(i));
+          uint32_t cd_vn = lookup_or_add(cdep->getOperand(i));
+          if (c_vn != cd_vn) {
+            valueNumbering.insert(std::make_pair(V, nextValueNumber));
+            return nextValueNumber++;
+          }
+        }
+        
+        uint32_t v = lookup_or_add(cdep);
         valueNumbering.insert(std::make_pair(V, v));
         return v;
       }
@@ -596,53 +678,29 @@
 }
 
 //===----------------------------------------------------------------------===//
-//                       ValueNumberedSet Class
+//                         GVN Pass
 //===----------------------------------------------------------------------===//
-namespace {
-class VISIBILITY_HIDDEN ValueNumberedSet {
-  private:
-    SmallPtrSet<Value*, 8> contents;
-    SparseBitVector<64> numbers;
-  public:
-    ValueNumberedSet() { }
-    ValueNumberedSet(const ValueNumberedSet& other) {
-      numbers = other.numbers;
-      contents = other.contents;
-    }
-    
-    typedef SmallPtrSet<Value*, 8>::iterator iterator;
-    
-    iterator begin() { return contents.begin(); }
-    iterator end() { return contents.end(); }
-    
-    bool insert(Value* v) { return contents.insert(v); }
-    void insert(iterator I, iterator E) { contents.insert(I, E); }
-    void erase(Value* v) { contents.erase(v); }
-    unsigned count(Value* v) { return contents.count(v); }
-    size_t size() { return contents.size(); }
-    
-    void set(unsigned i)  {
-      numbers.set(i);
-    }
-    
-    void operator=(const ValueNumberedSet& other) {
-      contents = other.contents;
-      numbers = other.numbers;
-    }
-    
-    void reset(unsigned i)  {
-      numbers.reset(i);
-    }
-    
-    bool test(unsigned i)  {
-      return numbers.test(i);
+
+namespace llvm {
+  template<> struct DenseMapInfo<uint32_t> {
+    static inline uint32_t getEmptyKey() { return ~0; }
+    static inline uint32_t getTombstoneKey() { return ~0 - 1; }
+    static unsigned getHashValue(const uint32_t& Val) { return Val * 37; }
+    static bool isPod() { return true; }
+    static bool isEqual(const uint32_t& LHS, const uint32_t& RHS) {
+      return LHS == RHS;
     }
-};
+  };
 }
 
-//===----------------------------------------------------------------------===//
-//                         GVN Pass
-//===----------------------------------------------------------------------===//
+namespace {
+  struct VISIBILITY_HIDDEN ValueNumberScope {
+    ValueNumberScope* parent;
+    DenseMap<uint32_t, Value*> table;
+    
+    ValueNumberScope(ValueNumberScope* p) : parent(p) { }
+  };
+}
 
 namespace {
 
@@ -654,8 +712,7 @@
 
   private:
     ValueTable VN;
-    
-    DenseMap<BasicBlock*, ValueNumberedSet> availableOut;
+    DenseMap<BasicBlock*, ValueNumberScope*> localAvail;
     
     typedef DenseMap<Value*, SmallPtrSet<Instruction*, 4> > PhiMapType;
     PhiMapType phiMap;
@@ -663,36 +720,35 @@
     
     // This transformation requires dominator postdominator info
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesCFG();
       AU.addRequired<DominatorTree>();
       AU.addRequired<MemoryDependenceAnalysis>();
       AU.addRequired<AliasAnalysis>();
-      AU.addRequired<TargetData>();
+      
+      AU.addPreserved<DominatorTree>();
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<MemoryDependenceAnalysis>();
-      AU.addPreserved<TargetData>();
     }
   
     // Helper fuctions
     // FIXME: eliminate or document these better
-    Value* find_leader(ValueNumberedSet& vals, uint32_t v) ;
-    void val_insert(ValueNumberedSet& s, Value* v);
     bool processLoad(LoadInst* L,
                      DenseMap<Value*, LoadInst*> &lastLoad,
                      SmallVectorImpl<Instruction*> &toErase);
     bool processInstruction(Instruction* I,
-                            ValueNumberedSet& currAvail,
                             DenseMap<Value*, LoadInst*>& lastSeenLoad,
                             SmallVectorImpl<Instruction*> &toErase);
     bool processNonLocalLoad(LoadInst* L,
                              SmallVectorImpl<Instruction*> &toErase);
+    bool processBlock(DomTreeNode* DTN);
     Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
                             DenseMap<BasicBlock*, Value*> &Phis,
                             bool top_level = false);
-    void dump(DenseMap<BasicBlock*, Value*>& d);
+    void dump(DenseMap<uint32_t, Value*>& d);
     bool iterateOnFunction(Function &F);
     Value* CollapsePhi(PHINode* p);
     bool isSafeReplacement(PHINode* p, Instruction* inst);
+    bool performPRE(Function& F);
+    Value* lookupNumber(BasicBlock* BB, uint32_t num);
   };
   
   char GVN::ID = 0;
@@ -704,37 +760,11 @@
 static RegisterPass<GVN> X("gvn",
                            "Global Value Numbering");
 
-/// find_leader - Given a set and a value number, return the first
-/// element of the set with that value number, or 0 if no such element
-/// is present
-Value* GVN::find_leader(ValueNumberedSet& vals, uint32_t v) {
-  if (!vals.test(v))
-    return 0;
-  
-  for (ValueNumberedSet::iterator I = vals.begin(), E = vals.end();
-       I != E; ++I)
-    if (v == VN.lookup(*I))
-      return *I;
-  
-  assert(0 && "No leader found, but present bit is set?");
-  return 0;
-}
-
-/// val_insert - Insert a value into a set only if there is not a value
-/// with the same value number already in the set
-void GVN::val_insert(ValueNumberedSet& s, Value* v) {
-  uint32_t num = VN.lookup(v);
-  if (!s.test(num))
-    s.insert(v);
-}
-
-void GVN::dump(DenseMap<BasicBlock*, Value*>& d) {
+void GVN::dump(DenseMap<uint32_t, Value*>& d) {
   printf("{\n");
-  for (DenseMap<BasicBlock*, Value*>::iterator I = d.begin(),
+  for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
        E = d.end(); I != E; ++I) {
-    if (I->second == MemoryDependenceAnalysis::None)
-      printf("None\n");
-    else
+      printf("%d\n", I->first);
       I->second->dump();
   }
   printf("}\n");
@@ -779,6 +809,11 @@
   DenseMap<BasicBlock*, Value*>::iterator V = Phis.find(BB);
   if (V != Phis.end() && !top_level) return V->second;
   
+  // If the block is unreachable, just return undef, since this path
+  // can't actually occur at runtime.
+  if (!getAnalysis<DominatorTree>().isReachableFromEntry(BB))
+    return Phis[BB] = UndefValue::get(orig->getType());
+  
   BasicBlock* singlePred = BB->getSinglePredecessor();
   if (singlePred) {
     Value *ret = GetValueForBlock(singlePred, orig, Phis);
@@ -990,20 +1025,49 @@
   return deletedLoad;
 }
 
+Value* GVN::lookupNumber(BasicBlock* BB, uint32_t num) {
+  DenseMap<BasicBlock*, ValueNumberScope*>::iterator I = localAvail.find(BB);
+  if (I == localAvail.end())
+    return 0;
+  
+  ValueNumberScope* locals = I->second;
+  
+  while (locals) {
+    DenseMap<uint32_t, Value*>::iterator I = locals->table.find(num);
+    if (I != locals->table.end())
+      return I->second;
+    else
+      locals = locals->parent;
+  }
+  
+  return 0;
+}
+
 /// processInstruction - When calculating availability, handle an instruction
 /// by inserting it into the appropriate sets
-bool GVN::processInstruction(Instruction *I, ValueNumberedSet &currAvail,
+bool GVN::processInstruction(Instruction *I,
                              DenseMap<Value*, LoadInst*> &lastSeenLoad,
                              SmallVectorImpl<Instruction*> &toErase) {
-  if (LoadInst* L = dyn_cast<LoadInst>(I))
-    return processLoad(L, lastSeenLoad, toErase);
+  if (LoadInst* L = dyn_cast<LoadInst>(I)) {
+    bool changed = processLoad(L, lastSeenLoad, toErase);
+    
+    if (!changed) {
+      unsigned num = VN.lookup_or_add(L);
+      localAvail[I->getParent()]->table.insert(std::make_pair(num, L));
+    }
+    
+    return changed;
+  }
+  
+  uint32_t nextNum = VN.getNextUnusedValueNumber();
+  unsigned num = VN.lookup_or_add(I);
   
   // Allocations are always uniquely numbered, so we can save time and memory
   // by fast failing them.
-  if (isa<AllocationInst>(I))
+  if (isa<AllocationInst>(I) || isa<TerminatorInst>(I)) {
+    localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
     return false;
-  
-  unsigned num = VN.lookup_or_add(I);
+  }
   
   // Collapse PHI nodes
   if (PHINode* p = dyn_cast<PHINode>(I)) {
@@ -1017,11 +1081,18 @@
         
       p->replaceAllUsesWith(constVal);
       toErase.push_back(p);
+    } else {
+      localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
     }
-  // Perform value-number based elimination
-  } else if (currAvail.test(num)) {
-    Value* repl = find_leader(currAvail, num);
+  
+  // If the number we were assigned was a brand new VN, then we don't
+  // need to do a lookup to see if the number already exists
+  // somewhere in the domtree: it can't!
+  } else if (num == nextNum) {
+    localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
     
+  // Perform value-number based elimination
+  } else if (Value* repl = lookupNumber(I->getParent(), num)) {
     // Remove it!
     MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
     MD.removeInstruction(I);
@@ -1030,9 +1101,8 @@
     I->replaceAllUsesWith(repl);
     toErase.push_back(I);
     return true;
-  } else if (!I->isTerminator()) {
-    currAvail.set(num);
-    currAvail.insert(I);
+  } else {
+    localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
   }
   
   return false;
@@ -1044,6 +1114,7 @@
 bool GVN::runOnFunction(Function& F) {
   VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
   VN.setMemDep(&getAnalysis<MemoryDependenceAnalysis>());
+  VN.setDomTree(&getAnalysis<DominatorTree>());
   
   bool changed = false;
   bool shouldContinue = true;
@@ -1057,72 +1128,220 @@
 }
 
 
-// GVN::iterateOnFunction - Executes one iteration of GVN
-bool GVN::iterateOnFunction(Function &F) {
-  // Clean out global sets from any previous functions
-  VN.clear();
-  availableOut.clear();
-  phiMap.clear();
- 
-  bool changed_function = false;
-  
-  DominatorTree &DT = getAnalysis<DominatorTree>();   
-  
+bool GVN::processBlock(DomTreeNode* DTN) {
+  BasicBlock* BB = DTN->getBlock();
+
   SmallVector<Instruction*, 8> toErase;
   DenseMap<Value*, LoadInst*> lastSeenLoad;
-  DenseMap<DomTreeNode*, size_t> numChildrenVisited;
-
-  // Top-down walk of the dominator tree
-  for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
-         E = df_end(DT.getRootNode()); DI != E; ++DI) {
+  bool changed_function = false;
+  
+  if (DTN->getIDom())
+    localAvail[BB] =
+                  new ValueNumberScope(localAvail[DTN->getIDom()->getBlock()]);
+  else
+    localAvail[BB] = new ValueNumberScope(0);
+  
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+       BI != BE;) {
+    changed_function |= processInstruction(BI, lastSeenLoad, toErase);
+    if (toErase.empty()) {
+      ++BI;
+      continue;
+    }
     
-    // Get the set to update for this block
-    ValueNumberedSet& currAvail = availableOut[DI->getBlock()];     
-    lastSeenLoad.clear();
+    // If we need some instructions deleted, do it now.
+    NumGVNInstr += toErase.size();
+    
+    // Avoid iterator invalidation.
+    bool AtStart = BI == BB->begin();
+    if (!AtStart)
+      --BI;
+
+    for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
+         E = toErase.end(); I != E; ++I)
+      (*I)->eraseFromParent();
 
-    BasicBlock* BB = DI->getBlock();
+    if (AtStart)
+      BI = BB->begin();
+    else
+      ++BI;
+    
+    toErase.clear();
+  }
   
-    // A block inherits AVAIL_OUT from its dominator
-    if (DI->getIDom() != 0) {
-      currAvail = availableOut[DI->getIDom()->getBlock()];
+  return changed_function;
+}
+
+/// performPRE - Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function& F) {
+  bool changed = false;
+  SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
+  for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+       DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+    BasicBlock* CurrentBlock = *DI;
+    
+    // Nothing to PRE in the entry block.
+    if (CurrentBlock == &F.getEntryBlock()) continue;
+    
+    for (BasicBlock::iterator BI = CurrentBlock->begin(),
+         BE = CurrentBlock->end(); BI != BE; ) {
+      if (isa<AllocationInst>(BI) || isa<TerminatorInst>(BI) ||
+          isa<PHINode>(BI) || BI->mayReadFromMemory() ||
+          BI->mayWriteToMemory()) {
+        BI++;
+        continue;
+      }
+      
+      uint32_t valno = VN.lookup(BI);
       
-      numChildrenVisited[DI->getIDom()]++;
+      // Look for the predecessors for PRE opportunities.  We're
+      // only trying to solve the basic diamond case, where
+      // a value is computed in the successor and one predecessor,
+      // but not the other.  We also explicitly disallow cases
+      // where the successor is its own predecessor, because they're
+      // more complicated to get right.
+      unsigned numWith = 0;
+      unsigned numWithout = 0;
+      BasicBlock* PREPred = 0;
+      DenseMap<BasicBlock*, Value*> predMap;
+      for (pred_iterator PI = pred_begin(CurrentBlock),
+           PE = pred_end(CurrentBlock); PI != PE; ++PI) {
+        // We're not interested in PRE where the block is its
+        // own predecessor, on in blocks with predecessors
+        // that are not reachable.
+        if (*PI == CurrentBlock) {
+          numWithout = 2;
+          break;
+        } else if (!localAvail.count(*PI))  {
+          numWithout = 2;
+          break;
+        }
+        
+        DenseMap<uint32_t, Value*>::iterator predV = 
+                                            localAvail[*PI]->table.find(valno);
+        if (predV == localAvail[*PI]->table.end()) {
+          PREPred = *PI;
+          numWithout++;
+        } else if (predV->second == BI) {
+          numWithout = 2;
+        } else {
+          predMap[*PI] = predV->second;
+          numWith++;
+        }
+      }
       
-      if (numChildrenVisited[DI->getIDom()] == DI->getIDom()->getNumChildren()) {
-        availableOut.erase(DI->getIDom()->getBlock());
-        numChildrenVisited.erase(DI->getIDom());
+      // Don't do PRE when it might increase code size, i.e. when
+      // we would need to insert instructions in more than one pred.
+      if (numWithout != 1 || numWith == 0) {
+        BI++;
+        continue;
       }
-    }
-
-    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
-         BI != BE;) {
-      changed_function |= processInstruction(BI, currAvail,
-                                             lastSeenLoad, toErase);
-      if (toErase.empty()) {
-        ++BI;
+      
+      // We can't do PRE safely on a critical edge, so instead we schedule
+      // the edge to be split and perform the PRE the next time we iterate
+      // on the function.
+      unsigned succNum = 0;
+      for (unsigned i = 0, e = PREPred->getTerminator()->getNumSuccessors();
+           i != e; ++i)
+        if (PREPred->getTerminator()->getSuccessor(i) == PREPred) {
+          succNum = i;
+          break;
+        }
+        
+      if (isCriticalEdge(PREPred->getTerminator(), succNum)) {
+        toSplit.push_back(std::make_pair(PREPred->getTerminator(), succNum));
+        changed = true;
+        BI++;
         continue;
       }
       
-      // If we need some instructions deleted, do it now.
-      NumGVNInstr += toErase.size();
+      // Instantiate the expression the in predecessor that lacked it.
+      // Because we are going top-down through the block, all value numbers
+      // will be available in the predecessor by the time we need them.  Any
+      // that weren't original present will have been instantiated earlier
+      // in this loop.
+      Instruction* PREInstr = BI->clone();
+      bool success = true;
+      for (unsigned i = 0; i < BI->getNumOperands(); ++i) {
+        Value* op = BI->getOperand(i);
+        if (isa<Argument>(op) || isa<Constant>(op) || isa<GlobalValue>(op))
+          PREInstr->setOperand(i, op);
+        else if (!lookupNumber(PREPred, VN.lookup(op))) {
+          success = false;
+          break;
+        } else
+          PREInstr->setOperand(i, lookupNumber(PREPred, VN.lookup(op)));
+      }
       
-      // Avoid iterator invalidation.
-      bool AtStart = BI == BB->begin();
-      if (!AtStart)
-        --BI;
-
-      for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
-           E = toErase.end(); I != E; ++I)
-        (*I)->eraseFromParent();
-
-      if (AtStart)
-        BI = BB->begin();
-      else
-        ++BI;
+      // Fail out if we encounter an operand that is not available in
+      // the PRE predecessor.  This is typically because of loads which 
+      // are not value numbered precisely.
+      if (!success) {
+        delete PREInstr;
+        BI++;
+        continue;
+      }
+      
+      PREInstr->insertBefore(PREPred->getTerminator());
+      PREInstr->setName(BI->getName() + ".pre");
+      predMap[PREPred] = PREInstr;
+      VN.add(PREInstr, valno);
+      NumGVNPRE++;
+      
+      // Update the availability map to include the new instruction.
+      localAvail[PREPred]->table.insert(std::make_pair(valno, PREInstr));
+      
+      // Create a PHI to make the value available in this block.
+      PHINode* Phi = PHINode::Create(BI->getType(),
+                                     BI->getName() + ".pre-phi",
+                                     CurrentBlock->begin());
+      for (pred_iterator PI = pred_begin(CurrentBlock),
+           PE = pred_end(CurrentBlock); PI != PE; ++PI)
+        Phi->addIncoming(predMap[*PI], *PI);
       
-      toErase.clear();
+      VN.add(Phi, valno);
+      localAvail[CurrentBlock]->table[valno] = Phi;
+      
+      BI->replaceAllUsesWith(Phi);
+      VN.erase(BI);
+      
+      Instruction* erase = BI;
+      BI++;
+      erase->eraseFromParent();
+      
+      changed = true;
     }
   }
   
-  return changed_function;
+  for (SmallVector<std::pair<TerminatorInst*, unsigned>, 4>::iterator
+       I = toSplit.begin(), E = toSplit.end(); I != E; ++I)
+    SplitCriticalEdge(I->first, I->second, this);
+  
+  return changed;
+}
+
+// GVN::iterateOnFunction - Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+  // Clean out global sets from any previous functions
+  VN.clear();
+  phiMap.clear();
+  
+  for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator
+       I = localAvail.begin(), E = localAvail.end(); I != E; ++I)
+    delete I->second;
+  localAvail.clear();
+  
+  DominatorTree &DT = getAnalysis<DominatorTree>();   
+
+  // Top-down walk of the dominator tree
+  bool changed = false;
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+       DE = df_end(DT.getRootNode()); DI != DE; ++DI)
+    changed |= processBlock(*DI);
+  
+  if (EnablePRE)
+    changed |= performPRE(F);
+  
+  return changed;
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/GVNPRE.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/GVNPRE.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/GVNPRE.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/GVNPRE.cpp Sun Jul  6 15:45:41 2008
@@ -16,6 +16,9 @@
 // live ranges, and should be used with caution on platforms that are very 
 // sensitive to register pressure.
 //
+// Note that this pass does the value numbering itself, it does not use the
+// ValueNumbering analysis passes.
+//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "gvnpre"
@@ -45,6 +48,8 @@
 //                         ValueTable Class
 //===----------------------------------------------------------------------===//
 
+namespace {
+
 /// This class holds the mapping between values and value numbers.  It is used
 /// as an efficient mechanism to determine the expression-wise equivalence of
 /// two values.
@@ -123,6 +128,7 @@
   }
 };
 
+}
 
 namespace {
   class VISIBILITY_HIDDEN ValueTable {
@@ -596,6 +602,8 @@
   return nextValueNumber;
 }
 
+namespace {
+
 //===----------------------------------------------------------------------===//
 //                       ValueNumberedSet Class
 //===----------------------------------------------------------------------===//
@@ -652,6 +660,8 @@
     }
 };
 
+}
+
 //===----------------------------------------------------------------------===//
 //                         GVNPRE Pass
 //===----------------------------------------------------------------------===//
@@ -797,7 +807,7 @@
     if (newOp1 != U->getOperand(0)) {
       Instruction* newVal = 0;
       if (CastInst* C = dyn_cast<CastInst>(U))
-        newVal = CastInst::create(C->getOpcode(),
+        newVal = CastInst::Create(C->getOpcode(),
                                   newOp1, C->getType(),
                                   C->getName()+".expr");
       
@@ -840,11 +850,11 @@
     if (newOp1 != U->getOperand(0) || newOp2 != U->getOperand(1)) {
       Instruction* newVal = 0;
       if (BinaryOperator* BO = dyn_cast<BinaryOperator>(U))
-        newVal = BinaryOperator::create(BO->getOpcode(),
+        newVal = BinaryOperator::Create(BO->getOpcode(),
                                         newOp1, newOp2,
                                         BO->getName()+".expr");
       else if (CmpInst* C = dyn_cast<CmpInst>(U))
-        newVal = CmpInst::create(C->getOpcode(),
+        newVal = CmpInst::Create(C->getOpcode(),
                                  C->getPredicate(),
                                  newOp1, newOp2,
                                  C->getName()+".expr");
@@ -902,12 +912,13 @@
       Instruction* newVal = 0;
       if (ShuffleVectorInst* S = dyn_cast<ShuffleVectorInst>(U))
         newVal = new ShuffleVectorInst(newOp1, newOp2, newOp3,
-                                       S->getName()+".expr");
+                                       S->getName() + ".expr");
       else if (InsertElementInst* I = dyn_cast<InsertElementInst>(U))
         newVal = InsertElementInst::Create(newOp1, newOp2, newOp3,
-                                           I->getName()+".expr");
+                                           I->getName() + ".expr");
       else if (SelectInst* I = dyn_cast<SelectInst>(U))
-        newVal = SelectInst::Create(newOp1, newOp2, newOp3, I->getName()+".expr");
+        newVal = SelectInst::Create(newOp1, newOp2, newOp3,
+                                    I->getName() + ".expr");
       
       uint32_t v = VN.lookup_or_add(newVal);
       
@@ -1657,11 +1668,11 @@
       
       Value* newVal = 0;
       if (BinaryOperator* BO = dyn_cast<BinaryOperator>(U))
-        newVal = BinaryOperator::create(BO->getOpcode(), s1, s2,
+        newVal = BinaryOperator::Create(BO->getOpcode(), s1, s2,
                                         BO->getName()+".gvnpre",
                                         (*PI)->getTerminator());
       else if (CmpInst* C = dyn_cast<CmpInst>(U))
-        newVal = CmpInst::create(C->getOpcode(), C->getPredicate(), s1, s2,
+        newVal = CmpInst::Create(C->getOpcode(), C->getPredicate(), s1, s2,
                                  C->getName()+".gvnpre", 
                                  (*PI)->getTerminator());
       else if (ShuffleVectorInst* S = dyn_cast<ShuffleVectorInst>(U))
@@ -1677,7 +1688,7 @@
         newVal = SelectInst::Create(s1, s2, s3, S->getName()+".gvnpre",
                                     (*PI)->getTerminator());
       else if (CastInst* C = dyn_cast<CastInst>(U))
-        newVal = CastInst::create(C->getOpcode(), s1, C->getType(),
+        newVal = CastInst::Create(C->getOpcode(), s1, C->getType(),
                                   C->getName()+".gvnpre", 
                                   (*PI)->getTerminator());
       else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(U))

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/IndVarSimplify.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/IndVarSimplify.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/IndVarSimplify.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/IndVarSimplify.cpp Sun Jul  6 15:45:41 2008
@@ -94,11 +94,12 @@
 
     void DeleteTriviallyDeadInstructions(std::set<Instruction*> &Insts);
   };
-
-  char IndVarSimplify::ID = 0;
-  RegisterPass<IndVarSimplify> X("indvars", "Canonicalize Induction Variables");
 }
 
+char IndVarSimplify::ID = 0;
+static RegisterPass<IndVarSimplify>
+X("indvars", "Canonicalize Induction Variables");
+
 LoopPass *llvm::createIndVarSimplifyPass() {
   return new IndVarSimplify();
 }
@@ -150,7 +151,7 @@
       NewPhi->addIncoming(Constant::getNullValue(NewPhi->getType()), Preheader);
 
       // Create the new add instruction.
-      Value *NewAdd = BinaryOperator::createAdd(NewPhi, AddedVal,
+      Value *NewAdd = BinaryOperator::CreateAdd(NewPhi, AddedVal,
                                                 GEPI->getName()+".rec", GEPI);
       NewPhi->addIncoming(NewAdd, PN->getIncomingBlock(BackedgeIdx));
 
@@ -318,8 +319,7 @@
     BlockToInsertInto = ExitBlocks[0];
   else
     BlockToInsertInto = Preheader;
-  BasicBlock::iterator InsertPt = BlockToInsertInto->begin();
-  while (isa<PHINode>(InsertPt)) ++InsertPt;
+  BasicBlock::iterator InsertPt = BlockToInsertInto->getFirstNonPHI();
 
   bool HasConstantItCount = isa<SCEVConstant>(SE->getIterationCount(L));
 
@@ -522,11 +522,7 @@
   DOUT << "INDVARS: New CanIV: " << *IndVar;
 
   if (!isa<SCEVCouldNotCompute>(IterationCount)) {
-    if (IterationCount->getType()->getPrimitiveSizeInBits() <
-        LargestType->getPrimitiveSizeInBits())
-      IterationCount = SE->getZeroExtendExpr(IterationCount, LargestType);
-    else if (IterationCount->getType() != LargestType)
-      IterationCount = SE->getTruncateExpr(IterationCount, LargestType);
+    IterationCount = SE->getTruncateOrZeroExtend(IterationCount, LargestType);
     if (Instruction *DI = LinearFunctionTestReplace(L, IterationCount,Rewriter))
       DeadInsts.insert(DI);
   }
@@ -534,8 +530,7 @@
   // Now that we have a canonical induction variable, we can rewrite any
   // recurrences in terms of the induction variable.  Start with the auxillary
   // induction variables, and recursively rewrite any of their uses.
-  BasicBlock::iterator InsertPt = Header->begin();
-  while (isa<PHINode>(InsertPt)) ++InsertPt;
+  BasicBlock::iterator InsertPt = Header->getFirstNonPHI();
 
   // If there were induction variables of other sizes, cast the primary
   // induction variable to the right size for them, avoiding the need for the
@@ -579,9 +574,10 @@
 #if 0
   // Now replace all derived expressions in the loop body with simpler
   // expressions.
-  for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i)
-    if (LI->getLoopFor(L->getBlocks()[i]) == L) {  // Not in a subloop...
-      BasicBlock *BB = L->getBlocks()[i];
+  for (LoopInfo::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    if (LI->getLoopFor(BB) == L) {  // Not in a subloop...
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
         if (I->getType()->isInteger() &&      // Is an integer instruction
             !I->use_empty() &&
@@ -598,6 +594,7 @@
           }
         }
     }
+  }
 #endif
 
   DeleteTriviallyDeadInstructions(DeadInsts);

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/InstructionCombining.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/InstructionCombining.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/InstructionCombining.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/InstructionCombining.cpp Sun Jul  6 15:45:41 2008
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 //
 // InstructionCombining - Combine instructions to form fewer, simple
-// instructions.  This pass does not modify the CFG This pass is where algebraic
-// simplification happens.
+// instructions.  This pass does not modify the CFG.  This pass is where
+// algebraic simplification happens.
 //
 // This pass combines things like:
 //    %Y = add i32 %X, 1
@@ -40,6 +40,7 @@
 #include "llvm/DerivedTypes.h"
 #include "llvm/GlobalVariable.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -121,8 +122,8 @@
     /// the work lists because they might get more simplified now.
     ///
     void AddUsesToWorkList(Instruction &I) {
-      for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
-        if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i)))
+      for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(*i))
           AddToWorkList(Op);
     }
     
@@ -135,11 +136,11 @@
     Value *AddSoonDeadInstToWorklist(Instruction &I, unsigned op) {
       Value *R = I.getOperand(op);
       
-      for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
-        if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i))) {
+      for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(*i)) {
           AddToWorkList(Op);
           // Set the operand to undef to drop the use.
-          I.setOperand(i, UndefValue::get(Op->getType()));
+          *i = UndefValue::get(Op->getType());
         }
       
       return R;
@@ -185,6 +186,8 @@
     Instruction *visitAShr(BinaryOperator &I);
     Instruction *visitLShr(BinaryOperator &I);
     Instruction *commonShiftTransforms(BinaryOperator &I);
+    Instruction *FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI,
+                                      Constant *RHSC);
     Instruction *visitFCmpInst(FCmpInst &I);
     Instruction *visitICmpInst(ICmpInst &I);
     Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
@@ -206,8 +209,8 @@
     Instruction *visitSExt(SExtInst &CI);
     Instruction *visitFPTrunc(FPTruncInst &CI);
     Instruction *visitFPExt(CastInst &CI);
-    Instruction *visitFPToUI(CastInst &CI);
-    Instruction *visitFPToSI(CastInst &CI);
+    Instruction *visitFPToUI(FPToUIInst &FI);
+    Instruction *visitFPToSI(FPToSIInst &FI);
     Instruction *visitUIToFP(CastInst &CI);
     Instruction *visitSIToFP(CastInst &CI);
     Instruction *visitPtrToInt(CastInst &CI);
@@ -229,6 +232,7 @@
     Instruction *visitInsertElementInst(InsertElementInst &IE);
     Instruction *visitExtractElementInst(ExtractElementInst &EI);
     Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+    Instruction *visitExtractValueInst(ExtractValueInst &EV);
 
     // visitInstruction - Specify what to return for unhandled instructions...
     Instruction *visitInstruction(Instruction &I) { return 0; }
@@ -239,6 +243,7 @@
     Instruction *transformCallThroughTrampoline(CallSite CS);
     Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
                                    bool DoXform = true);
+    bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
 
   public:
     // InsertNewInstBefore - insert an instruction New before instruction Old
@@ -263,7 +268,7 @@
       if (Constant *CV = dyn_cast<Constant>(V))
         return ConstantExpr::getCast(opc, CV, Ty);
       
-      Instruction *C = CastInst::create(opc, V, Ty, V->getName(), &Pos);
+      Instruction *C = CastInst::Create(opc, V, Ty, V->getName(), &Pos);
       AddToWorkList(C);
       return C;
     }
@@ -320,6 +325,19 @@
       I.eraseFromParent();
       return 0;  // Don't do anything with FI
     }
+        
+    void ComputeMaskedBits(Value *V, const APInt &Mask, APInt &KnownZero,
+                           APInt &KnownOne, unsigned Depth = 0) const {
+      return llvm::ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth);
+    }
+    
+    bool MaskedValueIsZero(Value *V, const APInt &Mask, 
+                           unsigned Depth = 0) const {
+      return llvm::MaskedValueIsZero(V, Mask, TD, Depth);
+    }
+    unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0) const {
+      return llvm::ComputeNumSignBits(Op, TD, Depth);
+    }
 
   private:
     /// InsertOperandCastBefore - This inserts a cast of V to DestTy before the
@@ -370,24 +388,24 @@
     Instruction *MatchBSwap(BinaryOperator &I);
     bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
     Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
+    Instruction *SimplifyMemSet(MemSetInst *MI);
 
 
     Value *EvaluateInDifferentType(Value *V, const Type *Ty, bool isSigned);
 
-    void ComputeMaskedBits(Value *V, const APInt &Mask, APInt& KnownZero, 
-                           APInt& KnownOne, unsigned Depth = 0);
-    bool MaskedValueIsZero(Value *V, const APInt& Mask, unsigned Depth = 0);
     bool CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
                                     unsigned CastOpc,
                                     int &NumCastsRemoved);
     unsigned GetOrEnforceKnownAlignment(Value *V,
                                         unsigned PrefAlign = 0);
-  };
 
-  char InstCombiner::ID = 0;
-  RegisterPass<InstCombiner> X("instcombine", "Combine redundant instructions");
+  };
 }
 
+char InstCombiner::ID = 0;
+static RegisterPass<InstCombiner>
+X("instcombine", "Combine redundant instructions");
+
 // getComplexity:  Assign a complexity or rank value to LLVM Values...
 //   0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst
 static unsigned getComplexity(Value *V) {
@@ -511,7 +529,7 @@
 
           // Fold (op (op V1, C1), (op V2, C2)) ==> (op (op V1, V2), (op C1,C2))
           Constant *Folded = ConstantExpr::get(I.getOpcode(), C1, C2);
-          Instruction *New = BinaryOperator::create(Opcode, Op->getOperand(0),
+          Instruction *New = BinaryOperator::Create(Opcode, Op->getOperand(0),
                                                     Op1->getOperand(0),
                                                     Op1->getName(), &I);
           AddToWorkList(New);
@@ -544,6 +562,11 @@
   // Constants can be considered to be negated values if they can be folded.
   if (ConstantInt *C = dyn_cast<ConstantInt>(V))
     return ConstantExpr::getNeg(C);
+
+  if (ConstantVector *C = dyn_cast<ConstantVector>(V))
+    if (C->getType()->getElementType()->isInteger())
+      return ConstantExpr::getNeg(C);
+
   return 0;
 }
 
@@ -592,10 +615,10 @@
 
 /// getOpcode - If this is an Instruction or a ConstantExpr, return the
 /// opcode value. Otherwise return UserOp1.
-static unsigned getOpcode(User *U) {
-  if (Instruction *I = dyn_cast<Instruction>(U))
+static unsigned getOpcode(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V))
     return I->getOpcode();
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U))
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
     return CE->getOpcode();
   // Use UserOp1 to mean there's no opcode.
   return Instruction::UserOp1;
@@ -650,457 +673,6 @@
     return MulExt.ugt(APInt::getLowBitsSet(W * 2, W));
 }
 
-/// ComputeMaskedBits - Determine which of the bits specified in Mask are
-/// known to be either zero or one and return them in the KnownZero/KnownOne
-/// bit sets.  This code only analyzes bits in Mask, in order to short-circuit
-/// processing.
-/// NOTE: we cannot consider 'undef' to be "IsZero" here.  The problem is that
-/// we cannot optimize based on the assumption that it is zero without changing
-/// it to be an explicit zero.  If we don't change it to zero, other code could
-/// optimized based on the contradictory assumption that it is non-zero.
-/// Because instcombine aggressively folds operations with undef args anyway,
-/// this won't lose us code quality.
-void InstCombiner::ComputeMaskedBits(Value *V, const APInt &Mask,
-                                     APInt& KnownZero, APInt& KnownOne,
-                                     unsigned Depth) {
-  assert(V && "No Value?");
-  assert(Depth <= 6 && "Limit Search Depth");
-  uint32_t BitWidth = Mask.getBitWidth();
-  assert((V->getType()->isInteger() || isa<PointerType>(V->getType())) &&
-         "Not integer or pointer type!");
-  assert((!TD || TD->getTypeSizeInBits(V->getType()) == BitWidth) &&
-         (!isa<IntegerType>(V->getType()) ||
-          V->getType()->getPrimitiveSizeInBits() == BitWidth) &&
-         KnownZero.getBitWidth() == BitWidth && 
-         KnownOne.getBitWidth() == BitWidth &&
-         "V, Mask, KnownOne and KnownZero should have same BitWidth");
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
-    // We know all of the bits for a constant!
-    KnownOne = CI->getValue() & Mask;
-    KnownZero = ~KnownOne & Mask;
-    return;
-  }
-  // Null is all-zeros.
-  if (isa<ConstantPointerNull>(V)) {
-    KnownOne.clear();
-    KnownZero = Mask;
-    return;
-  }
-  // The address of an aligned GlobalValue has trailing zeros.
-  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
-    unsigned Align = GV->getAlignment();
-    if (Align == 0 && TD && GV->getType()->getElementType()->isSized()) 
-      Align = TD->getPrefTypeAlignment(GV->getType()->getElementType());
-    if (Align > 0)
-      KnownZero = Mask & APInt::getLowBitsSet(BitWidth,
-                                              CountTrailingZeros_32(Align));
-    else
-      KnownZero.clear();
-    KnownOne.clear();
-    return;
-  }
-
-  if (Depth == 6 || Mask == 0)
-    return;  // Limit search depth.
-
-  User *I = dyn_cast<User>(V);
-  if (!I) return;
-
-  KnownZero.clear(); KnownOne.clear();   // Don't know anything.
-  APInt KnownZero2(KnownZero), KnownOne2(KnownOne);
-  
-  switch (getOpcode(I)) {
-  default: break;
-  case Instruction::And: {
-    // If either the LHS or the RHS are Zero, the result is zero.
-    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
-    APInt Mask2(Mask & ~KnownZero);
-    ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
-    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
-    
-    // Output known-1 bits are only known if set in both the LHS & RHS.
-    KnownOne &= KnownOne2;
-    // Output known-0 are known to be clear if zero in either the LHS | RHS.
-    KnownZero |= KnownZero2;
-    return;
-  }
-  case Instruction::Or: {
-    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
-    APInt Mask2(Mask & ~KnownOne);
-    ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
-    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
-    
-    // Output known-0 bits are only known if clear in both the LHS & RHS.
-    KnownZero &= KnownZero2;
-    // Output known-1 are known to be set if set in either the LHS | RHS.
-    KnownOne |= KnownOne2;
-    return;
-  }
-  case Instruction::Xor: {
-    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
-    ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
-    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
-    
-    // Output known-0 bits are known if clear or set in both the LHS & RHS.
-    APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
-    // Output known-1 are known to be set if set in only one of the LHS, RHS.
-    KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
-    KnownZero = KnownZeroOut;
-    return;
-  }
-  case Instruction::Mul: {
-    APInt Mask2 = APInt::getAllOnesValue(BitWidth);
-    ComputeMaskedBits(I->getOperand(1), Mask2, KnownZero, KnownOne, Depth+1);
-    ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
-    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
-    
-    // If low bits are zero in either operand, output low known-0 bits.
-    // More trickiness is possible, but this is sufficient for the
-    // interesting case of alignment computation.
-    KnownOne.clear();
-    unsigned TrailZ = KnownZero.countTrailingOnes() +
-                      KnownZero2.countTrailingOnes();
-    TrailZ = std::min(TrailZ, BitWidth);
-    KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ);
-    KnownZero &= Mask;
-    return;
-  }
-  case Instruction::Select:
-    ComputeMaskedBits(I->getOperand(2), Mask, KnownZero, KnownOne, Depth+1);
-    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero2, KnownOne2, Depth+1);
-    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
-
-    // Only known if known in both the LHS and RHS.
-    KnownOne &= KnownOne2;
-    KnownZero &= KnownZero2;
-    return;
-  case Instruction::FPTrunc:
-  case Instruction::FPExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-    return; // Can't work with floating point.
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-    // We can't handle these if we don't know the pointer size.
-    if (!TD) return;
-    // Fall through and handle them the same as zext/trunc.
-  case Instruction::ZExt:
-  case Instruction::Trunc: {
-    // All these have integer operands
-    const Type *SrcTy = I->getOperand(0)->getType();
-    uint32_t SrcBitWidth = TD ?
-      TD->getTypeSizeInBits(SrcTy) :
-      SrcTy->getPrimitiveSizeInBits();
-    APInt MaskIn(Mask);
-    MaskIn.zextOrTrunc(SrcBitWidth);
-    KnownZero.zextOrTrunc(SrcBitWidth);
-    KnownOne.zextOrTrunc(SrcBitWidth);
-    ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, Depth+1);
-    KnownZero.zextOrTrunc(BitWidth);
-    KnownOne.zextOrTrunc(BitWidth);
-    // Any top bits are known to be zero.
-    if (BitWidth > SrcBitWidth)
-      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
-    return;
-  }
-  case Instruction::BitCast: {
-    const Type *SrcTy = I->getOperand(0)->getType();
-    if (SrcTy->isInteger() || isa<PointerType>(SrcTy)) {
-      ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
-      return;
-    }
-    break;
-  }
-  case Instruction::SExt: {
-    // Compute the bits in the result that are not present in the input.
-    const IntegerType *SrcTy = cast<IntegerType>(I->getOperand(0)->getType());
-    uint32_t SrcBitWidth = SrcTy->getBitWidth();
-      
-    APInt MaskIn(Mask); 
-    MaskIn.trunc(SrcBitWidth);
-    KnownZero.trunc(SrcBitWidth);
-    KnownOne.trunc(SrcBitWidth);
-    ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, Depth+1);
-    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-    KnownZero.zext(BitWidth);
-    KnownOne.zext(BitWidth);
-
-    // If the sign bit of the input is known set or clear, then we know the
-    // top bits of the result.
-    if (KnownZero[SrcBitWidth-1])             // Input sign bit known zero
-      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
-    else if (KnownOne[SrcBitWidth-1])           // Input sign bit known set
-      KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
-    return;
-  }
-  case Instruction::Shl:
-    // (shl X, C1) & C2 == 0   iff   (X & C2 >>u C1) == 0
-    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
-      APInt Mask2(Mask.lshr(ShiftAmt));
-      ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne, Depth+1);
-      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-      KnownZero <<= ShiftAmt;
-      KnownOne  <<= ShiftAmt;
-      KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); // low bits known 0
-      return;
-    }
-    break;
-  case Instruction::LShr:
-    // (ushr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
-    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      // Compute the new bits that are at the top now.
-      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
-      
-      // Unsigned shift right.
-      APInt Mask2(Mask.shl(ShiftAmt));
-      ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero,KnownOne,Depth+1);
-      assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); 
-      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
-      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
-      // high bits known zero.
-      KnownZero |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
-      return;
-    }
-    break;
-  case Instruction::AShr:
-    // (ashr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
-    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      // Compute the new bits that are at the top now.
-      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
-      
-      // Signed shift right.
-      APInt Mask2(Mask.shl(ShiftAmt));
-      ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero,KnownOne,Depth+1);
-      assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); 
-      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
-      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
-        
-      APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
-      if (KnownZero[BitWidth-ShiftAmt-1])    // New bits are known zero.
-        KnownZero |= HighBits;
-      else if (KnownOne[BitWidth-ShiftAmt-1])  // New bits are known one.
-        KnownOne |= HighBits;
-      return;
-    }
-    break;
-  case Instruction::Sub: {
-    if (ConstantInt *CLHS = dyn_cast<ConstantInt>(I->getOperand(0))) {
-      // We know that the top bits of C-X are clear if X contains less bits
-      // than C (i.e. no wrap-around can happen).  For example, 20-X is
-      // positive if we can prove that X is >= 0 and < 16.
-      if (!CLHS->getValue().isNegative()) {
-        unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros();
-        // NLZ can't be BitWidth with no sign bit
-        APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
-        ComputeMaskedBits(I->getOperand(1), MaskV, KnownZero, KnownOne, Depth+1);
-    
-        // If all of the MaskV bits are known to be zero, then we know the output
-        // top bits are zero, because we now know that the output is from [0-C].
-        if ((KnownZero & MaskV) == MaskV) {
-          unsigned NLZ2 = CLHS->getValue().countLeadingZeros();
-          // Top bits known zero.
-          KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask;
-          KnownOne = APInt(BitWidth, 0);   // No one bits known.
-        } else {
-          KnownZero = KnownOne = APInt(BitWidth, 0);  // Otherwise, nothing known.
-        }
-        return;
-      }        
-    }
-  }
-  // fall through
-  case Instruction::Add: {
-    // If either the LHS or the RHS are Zero, the result is zero.
-    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
-    ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
-    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
-      
-    // Output known-0 bits are known if clear or set in both the low clear bits
-    // common to both LHS & RHS.  For example, 8+(X<<3) is known to have the
-    // low 3 bits clear.
-    unsigned KnownZeroOut = std::min(KnownZero.countTrailingOnes(), 
-                                     KnownZero2.countTrailingOnes());
-      
-    KnownZero = APInt::getLowBitsSet(BitWidth, KnownZeroOut);
-    KnownOne = APInt(BitWidth, 0);
-    return;
-  }
-  case Instruction::SRem:
-    if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      APInt RA = Rem->getValue();
-      if (RA.isPowerOf2() || (-RA).isPowerOf2()) {
-        APInt LowBits = RA.isStrictlyPositive() ? ((RA - 1) | RA) : ~RA;
-        APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
-        ComputeMaskedBits(I->getOperand(0), Mask2,KnownZero2,KnownOne2,Depth+1);
-
-        // The sign of a remainder is equal to the sign of the first
-        // operand (zero being positive).
-        if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
-          KnownZero2 |= ~LowBits;
-        else if (KnownOne2[BitWidth-1])
-          KnownOne2 |= ~LowBits;
-
-        KnownZero |= KnownZero2 & Mask;
-        KnownOne |= KnownOne2 & Mask;
-
-        assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); 
-      }
-    }
-    break;
-  case Instruction::URem:
-    if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      APInt RA = Rem->getValue();
-      if (RA.isStrictlyPositive() && RA.isPowerOf2()) {
-        APInt LowBits = (RA - 1) | RA;
-        APInt Mask2 = LowBits & Mask;
-        KnownZero |= ~LowBits & Mask;
-        ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne,Depth+1);
-        assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
-      }
-    } else {
-      // Since the result is less than or equal to RHS, any leading zero bits
-      // in RHS must also exist in the result.
-      APInt AllOnes = APInt::getAllOnesValue(BitWidth);
-      ComputeMaskedBits(I->getOperand(1), AllOnes, KnownZero2, KnownOne2,
-                        Depth+1);
-
-      uint32_t Leaders = KnownZero2.countLeadingOnes();
-      KnownZero |= APInt::getHighBitsSet(BitWidth, Leaders) & Mask;
-      assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
-    }
-    break;
-
-  case Instruction::Alloca:
-  case Instruction::Malloc: {
-    AllocationInst *AI = cast<AllocationInst>(V);
-    unsigned Align = AI->getAlignment();
-    if (Align == 0 && TD) {
-      if (isa<AllocaInst>(AI))
-        Align = TD->getPrefTypeAlignment(AI->getType()->getElementType());
-      else if (isa<MallocInst>(AI)) {
-        // Malloc returns maximally aligned memory.
-        Align = TD->getABITypeAlignment(AI->getType()->getElementType());
-        Align =
-          std::max(Align,
-                   (unsigned)TD->getABITypeAlignment(Type::DoubleTy));
-        Align =
-          std::max(Align,
-                   (unsigned)TD->getABITypeAlignment(Type::Int64Ty));
-      }
-    }
-    
-    if (Align > 0)
-      KnownZero = Mask & APInt::getLowBitsSet(BitWidth,
-                                              CountTrailingZeros_32(Align));
-    break;
-  }
-  case Instruction::GetElementPtr: {
-    // Analyze all of the subscripts of this getelementptr instruction
-    // to determine if we can prove known low zero bits.
-    APInt LocalMask = APInt::getAllOnesValue(BitWidth);
-    APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0);
-    ComputeMaskedBits(I->getOperand(0), LocalMask,
-                      LocalKnownZero, LocalKnownOne, Depth+1);
-    unsigned TrailZ = LocalKnownZero.countTrailingOnes();
-
-    gep_type_iterator GTI = gep_type_begin(I);
-    for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
-      Value *Index = I->getOperand(i);
-      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
-        // Handle struct member offset arithmetic.
-        if (!TD) return;
-        const StructLayout *SL = TD->getStructLayout(STy);
-        unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
-        uint64_t Offset = SL->getElementOffset(Idx);
-        TrailZ = std::min(TrailZ,
-                          CountTrailingZeros_64(Offset));
-      } else {
-        // Handle array index arithmetic.
-        const Type *IndexedTy = GTI.getIndexedType();
-        if (!IndexedTy->isSized()) return;
-        unsigned GEPOpiBits = Index->getType()->getPrimitiveSizeInBits();
-        uint64_t TypeSize = TD ? TD->getABITypeSize(IndexedTy) : 1;
-        LocalMask = APInt::getAllOnesValue(GEPOpiBits);
-        LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0);
-        ComputeMaskedBits(Index, LocalMask,
-                          LocalKnownZero, LocalKnownOne, Depth+1);
-        TrailZ = std::min(TrailZ,
-                          CountTrailingZeros_64(TypeSize) +
-                            LocalKnownZero.countTrailingOnes());
-      }
-    }
-    
-    KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) & Mask;
-    break;
-  }
-  case Instruction::PHI: {
-    PHINode *P = cast<PHINode>(I);
-    // Handle the case of a simple two-predecessor recurrence PHI.
-    // There's a lot more that could theoretically be done here, but
-    // this is sufficient to catch some interesting cases.
-    if (P->getNumIncomingValues() == 2) {
-      for (unsigned i = 0; i != 2; ++i) {
-        Value *L = P->getIncomingValue(i);
-        Value *R = P->getIncomingValue(!i);
-        User *LU = dyn_cast<User>(L);
-        unsigned Opcode = LU ? getOpcode(LU) : (unsigned)Instruction::UserOp1;
-        // Check for operations that have the property that if
-        // both their operands have low zero bits, the result
-        // will have low zero bits.
-        if (Opcode == Instruction::Add ||
-            Opcode == Instruction::Sub ||
-            Opcode == Instruction::And ||
-            Opcode == Instruction::Or ||
-            Opcode == Instruction::Mul) {
-          Value *LL = LU->getOperand(0);
-          Value *LR = LU->getOperand(1);
-          // Find a recurrence.
-          if (LL == I)
-            L = LR;
-          else if (LR == I)
-            L = LL;
-          else
-            break;
-          // Ok, we have a PHI of the form L op= R. Check for low
-          // zero bits.
-          APInt Mask2 = APInt::getAllOnesValue(BitWidth);
-          ComputeMaskedBits(R, Mask2, KnownZero2, KnownOne2, Depth+1);
-          Mask2 = APInt::getLowBitsSet(BitWidth,
-                                       KnownZero2.countTrailingOnes());
-          KnownOne2.clear();
-          KnownZero2.clear();
-          ComputeMaskedBits(L, Mask2, KnownZero2, KnownOne2, Depth+1);
-          KnownZero = Mask &
-                      APInt::getLowBitsSet(BitWidth,
-                                           KnownZero2.countTrailingOnes());
-          break;
-        }
-      }
-    }
-    break;
-  }
-  }
-}
-
-/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero.  We use
-/// this predicate to simplify operations downstream.  Mask is known to be zero
-/// for bits that V cannot have.
-bool InstCombiner::MaskedValueIsZero(Value *V, const APInt& Mask,
-                                     unsigned Depth) {
-  APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0);
-  ComputeMaskedBits(V, Mask, KnownZero, KnownOne, Depth);
-  assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
-  return (KnownZero & Mask) == Mask;
-}
 
 /// ShrinkDemandedConstant - Check to see if the specified operand of the 
 /// specified instruction is a constant integer.  If so, check to see if there
@@ -1232,7 +804,9 @@
   APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
   APInt &RHSKnownZero = KnownZero, &RHSKnownOne = KnownOne;
   switch (I->getOpcode()) {
-  default: break;
+  default:
+    ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+    break;
   case Instruction::And:
     // If either the LHS or the RHS are Zero, the result is zero.
     if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
@@ -1344,7 +918,7 @@
     //    e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
     if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) {
       Instruction *Or =
-        BinaryOperator::createOr(I->getOperand(0), I->getOperand(1),
+        BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
                                  I->getName());
       InsertNewInstBefore(Or, *I);
       return UpdateValueUsesWith(I, Or);
@@ -1359,7 +933,7 @@
       if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) {
         Constant *AndC = ConstantInt::get(~RHSKnownOne & DemandedMask);
         Instruction *And = 
-          BinaryOperator::createAnd(I->getOperand(0), AndC, "tmp");
+          BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
         InsertNewInstBefore(And, *I);
         return UpdateValueUsesWith(I, And);
       }
@@ -1518,7 +1092,7 @@
       // Turn it into OR if input bits are zero.
       if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) {
         Instruction *Or =
-          BinaryOperator::createOr(I->getOperand(0), I->getOperand(1),
+          BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
                                    I->getName());
         InsertNewInstBefore(Or, *I);
         return UpdateValueUsesWith(I, Or);
@@ -1578,6 +1152,9 @@
                                LHSKnownZero, LHSKnownOne, Depth+1))
         return true;
     }
+    // Otherwise just hand the sub off to ComputeMaskedBits to fill in
+    // the known zeros and ones.
+    ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
     break;
   case Instruction::Shl:
     if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
@@ -1623,7 +1200,7 @@
     // the shift amount is >= the size of the datatype, which is undefined.
     if (DemandedMask == 1) {
       // Perform the logical shift right.
-      Value *NewVal = BinaryOperator::createLShr(
+      Value *NewVal = BinaryOperator::CreateLShr(
                         I->getOperand(0), I->getOperand(1), I->getName());
       InsertNewInstBefore(cast<Instruction>(NewVal), *I);
       return UpdateValueUsesWith(I, NewVal);
@@ -1661,10 +1238,10 @@
         
       // If the input sign bit is known to be zero, or if none of the top bits
       // are demanded, turn this into an unsigned shift right.
-      if (RHSKnownZero[BitWidth-ShiftAmt-1] || 
+      if (BitWidth <= ShiftAmt || RHSKnownZero[BitWidth-ShiftAmt-1] || 
           (HighBits & ~DemandedMask) == HighBits) {
         // Perform the logical shift right.
-        Value *NewVal = BinaryOperator::createLShr(
+        Value *NewVal = BinaryOperator::CreateLShr(
                           I->getOperand(0), SA, I->getName());
         InsertNewInstBefore(cast<Instruction>(NewVal), *I);
         return UpdateValueUsesWith(I, NewVal);
@@ -1677,7 +1254,7 @@
     if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
       APInt RA = Rem->getValue();
       if (RA.isPowerOf2() || (-RA).isPowerOf2()) {
-        APInt LowBits = RA.isStrictlyPositive() ? (RA - 1) | RA : ~RA;
+        APInt LowBits = RA.isStrictlyPositive() ? (RA - 1) : ~RA;
         APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
         if (SimplifyDemandedBits(I->getOperand(0), Mask2,
                                  LHSKnownZero, LHSKnownOne, Depth+1))
@@ -1695,11 +1272,11 @@
       }
     }
     break;
-  case Instruction::URem:
+  case Instruction::URem: {
     if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
       APInt RA = Rem->getValue();
       if (RA.isPowerOf2()) {
-        APInt LowBits = (RA - 1) | RA;
+        APInt LowBits = (RA - 1);
         APInt Mask2 = LowBits & DemandedMask;
         KnownZero |= ~LowBits & DemandedMask;
         if (SimplifyDemandedBits(I->getOperand(0), Mask2,
@@ -1707,17 +1284,66 @@
           return true;
 
         assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); 
+        break;
       }
-    } else {
-      APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
-      APInt AllOnes = APInt::getAllOnesValue(BitWidth);
-      if (SimplifyDemandedBits(I->getOperand(1), AllOnes,
-                               KnownZero2, KnownOne2, Depth+1))
-        return true;
+    }
+
+    APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
+    APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+    if (SimplifyDemandedBits(I->getOperand(0), AllOnes,
+                             KnownZero2, KnownOne2, Depth+1))
+      return true;
+
+    uint32_t Leaders = KnownZero2.countLeadingOnes();
+    if (SimplifyDemandedBits(I->getOperand(1), AllOnes,
+                             KnownZero2, KnownOne2, Depth+1))
+      return true;
 
-      uint32_t Leaders = KnownZero2.countLeadingOnes();
-      KnownZero |= APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+    Leaders = std::max(Leaders,
+                       KnownZero2.countLeadingOnes());
+    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+    break;
+  }
+  case Instruction::Call:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::bswap: {
+        // If the only bits demanded come from one byte of the bswap result,
+        // just shift the input byte into position to eliminate the bswap.
+        unsigned NLZ = DemandedMask.countLeadingZeros();
+        unsigned NTZ = DemandedMask.countTrailingZeros();
+          
+        // Round NTZ down to the next byte.  If we have 11 trailing zeros, then
+        // we need all the bits down to bit 8.  Likewise, round NLZ.  If we
+        // have 14 leading zeros, round to 8.
+        NLZ &= ~7;
+        NTZ &= ~7;
+        // If we need exactly one byte, we can do this transformation.
+        if (BitWidth-NLZ-NTZ == 8) {
+          unsigned ResultBit = NTZ;
+          unsigned InputBit = BitWidth-NTZ-8;
+          
+          // Replace this with either a left or right shift to get the byte into
+          // the right place.
+          Instruction *NewVal;
+          if (InputBit > ResultBit)
+            NewVal = BinaryOperator::CreateLShr(I->getOperand(1),
+                    ConstantInt::get(I->getType(), InputBit-ResultBit));
+          else
+            NewVal = BinaryOperator::CreateShl(I->getOperand(1),
+                    ConstantInt::get(I->getType(), ResultBit-InputBit));
+          NewVal->takeName(I);
+          InsertNewInstBefore(NewVal, *I);
+          return UpdateValueUsesWith(I, NewVal);
+        }
+          
+        // TODO: Could compute known zero/one bits based on the input.
+        break;
+      }
+      }
     }
+    ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
     break;
   }
   
@@ -1966,12 +1592,12 @@
           default: assert(0 && "Case stmts out of sync!");
           case Intrinsic::x86_sse_sub_ss:
           case Intrinsic::x86_sse2_sub_sd:
-            TmpV = InsertNewInstBefore(BinaryOperator::createSub(LHS, RHS,
+            TmpV = InsertNewInstBefore(BinaryOperator::CreateSub(LHS, RHS,
                                                         II->getName()), *II);
             break;
           case Intrinsic::x86_sse_mul_ss:
           case Intrinsic::x86_sse2_mul_sd:
-            TmpV = InsertNewInstBefore(BinaryOperator::createMul(LHS, RHS,
+            TmpV = InsertNewInstBefore(BinaryOperator::CreateMul(LHS, RHS,
                                                          II->getName()), *II);
             break;
           }
@@ -1996,21 +1622,6 @@
   return MadeChange ? I : 0;
 }
 
-/// @returns true if the specified compare predicate is
-/// true when both operands are equal...
-/// @brief Determine if the icmp Predicate is true when both operands are equal
-static bool isTrueWhenEqual(ICmpInst::Predicate pred) {
-  return pred == ICmpInst::ICMP_EQ  || pred == ICmpInst::ICMP_UGE ||
-         pred == ICmpInst::ICMP_SGE || pred == ICmpInst::ICMP_ULE ||
-         pred == ICmpInst::ICMP_SLE;
-}
-
-/// @returns true if the specified compare instruction is
-/// true when both operands are equal...
-/// @brief Determine if the ICmpInst returns true when both operands are equal
-static bool isTrueWhenEqual(ICmpInst &ICI) {
-  return isTrueWhenEqual(ICI.getPredicate());
-}
 
 /// AssociativeOpt - Perform an optimization on an associative operator.  This
 /// function is designed to check a chain of associative operators for a
@@ -2021,7 +1632,7 @@
 /// 'shouldApply' and 'apply' methods.
 ///
 template<typename Functor>
-Instruction *AssociativeOpt(BinaryOperator &Root, const Functor &F) {
+static Instruction *AssociativeOpt(BinaryOperator &Root, const Functor &F) {
   unsigned Opcode = Root.getOpcode();
   Value *LHS = Root.getOperand(0);
 
@@ -2044,8 +1655,6 @@
     // If the functor wants to apply the optimization to the RHS of LHSI,
     // reassociate the expression from ((? op A) op B) to (? op (A op B))
     if (ShouldApply) {
-      BasicBlock *BB = Root.getParent();
-
       // Now all of the instructions are in the current basic block, go ahead
       // and perform the reassociation.
       Instruction *TmpLHSI = cast<Instruction>(Root.getOperand(0));
@@ -2061,9 +1670,8 @@
       }
       Root.replaceAllUsesWith(TmpLHSI);          // Users now use TmpLHSI
       TmpLHSI->setOperand(1, &Root);             // TmpLHSI now uses the root
-      TmpLHSI->getParent()->getInstList().remove(TmpLHSI);
       BasicBlock::iterator ARI = &Root; ++ARI;
-      BB->getInstList().insert(ARI, TmpLHSI);    // Move TmpLHSI to after Root
+      TmpLHSI->moveBefore(ARI);                  // Move TmpLHSI to after Root
       ARI = Root;
 
       // Now propagate the ExtraOperand down the chain of instructions until we
@@ -2072,8 +1680,7 @@
         Instruction *NextLHSI = cast<Instruction>(TmpLHSI->getOperand(0));
         // Move the instruction to immediately before the chain we are
         // constructing to avoid breaking dominance properties.
-        NextLHSI->getParent()->getInstList().remove(NextLHSI);
-        BB->getInstList().insert(ARI, NextLHSI);
+        NextLHSI->moveBefore(ARI);
         ARI = NextLHSI;
 
         Value *NextOp = NextLHSI->getOperand(1);
@@ -2092,6 +1699,7 @@
   return 0;
 }
 
+namespace {
 
 // AddRHS - Implements: X + X --> X << 1
 struct AddRHS {
@@ -2099,8 +1707,8 @@
   AddRHS(Value *rhs) : RHS(rhs) {}
   bool shouldApply(Value *LHS) const { return LHS == RHS; }
   Instruction *apply(BinaryOperator &Add) const {
-    return BinaryOperator::createShl(Add.getOperand(0),
-                                  ConstantInt::get(Add.getType(), 1));
+    return BinaryOperator::CreateShl(Add.getOperand(0),
+                                     ConstantInt::get(Add.getType(), 1));
   }
 };
 
@@ -2115,17 +1723,19 @@
            ConstantExpr::getAnd(C1, C2)->isNullValue();
   }
   Instruction *apply(BinaryOperator &Add) const {
-    return BinaryOperator::createOr(Add.getOperand(0), Add.getOperand(1));
+    return BinaryOperator::CreateOr(Add.getOperand(0), Add.getOperand(1));
   }
 };
 
+}
+
 static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
                                              InstCombiner *IC) {
   if (CastInst *CI = dyn_cast<CastInst>(&I)) {
     if (Constant *SOC = dyn_cast<Constant>(SO))
       return ConstantExpr::getCast(CI->getOpcode(), SOC, I.getType());
 
-    return IC->InsertNewInstBefore(CastInst::create(
+    return IC->InsertNewInstBefore(CastInst::Create(
           CI->getOpcode(), SO, I.getType(), SO->getName() + ".cast"), I);
   }
 
@@ -2144,9 +1754,9 @@
     std::swap(Op0, Op1);
   Instruction *New;
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
-    New = BinaryOperator::create(BO->getOpcode(), Op0, Op1,SO->getName()+".op");
+    New = BinaryOperator::Create(BO->getOpcode(), Op0, Op1,SO->getName()+".op");
   else if (CmpInst *CI = dyn_cast<CmpInst>(&I))
-    New = CmpInst::create(CI->getOpcode(), CI->getPredicate(), Op0, Op1, 
+    New = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), Op0, Op1, 
                           SO->getName()+".cmp");
   else {
     assert(0 && "Unknown binary instruction type!");
@@ -2232,11 +1842,11 @@
       } else {
         assert(PN->getIncomingBlock(i) == NonConstBB);
         if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) 
-          InV = BinaryOperator::create(BO->getOpcode(),
+          InV = BinaryOperator::Create(BO->getOpcode(),
                                        PN->getIncomingValue(i), C, "phitmp",
                                        NonConstBB->getTerminator());
         else if (CmpInst *CI = dyn_cast<CmpInst>(&I))
-          InV = CmpInst::create(CI->getOpcode(), 
+          InV = CmpInst::Create(CI->getOpcode(), 
                                 CI->getPredicate(),
                                 PN->getIncomingValue(i), C, "phitmp",
                                 NonConstBB->getTerminator());
@@ -2256,7 +1866,7 @@
         InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
       } else {
         assert(PN->getIncomingBlock(i) == NonConstBB);
-        InV = CastInst::create(CI->getOpcode(), PN->getIncomingValue(i), 
+        InV = CastInst::Create(CI->getOpcode(), PN->getIncomingValue(i), 
                                I.getType(), "phitmp", 
                                NonConstBB->getTerminator());
         AddToWorkList(cast<Instruction>(InV));
@@ -2268,42 +1878,28 @@
 }
 
 
-/// CannotBeNegativeZero - Return true if we can prove that the specified FP 
-/// value is never equal to -0.0.
-///
-/// Note that this function will need to be revisited when we support nondefault
-/// rounding modes!
-///
-static bool CannotBeNegativeZero(const Value *V) {
-  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
-    return !CFP->getValueAPF().isNegZero();
-
-  // (add x, 0.0) is guaranteed to return +0.0, not -0.0.
-  if (const Instruction *I = dyn_cast<Instruction>(V)) {
-    if (I->getOpcode() == Instruction::Add &&
-        isa<ConstantFP>(I->getOperand(1)) && 
-        cast<ConstantFP>(I->getOperand(1))->isNullValue())
-      return true;
-    
-    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
-      if (II->getIntrinsicID() == Intrinsic::sqrt)
-        return CannotBeNegativeZero(II->getOperand(1));
-    
-    if (const CallInst *CI = dyn_cast<CallInst>(I))
-      if (const Function *F = CI->getCalledFunction()) {
-        if (F->isDeclaration()) {
-          switch (F->getNameLen()) {
-          case 3:  // abs(x) != -0.0
-            if (!strcmp(F->getNameStart(), "abs")) return true;
-            break;
-          case 4:  // abs[lf](x) != -0.0
-            if (!strcmp(F->getNameStart(), "absf")) return true;
-            if (!strcmp(F->getNameStart(), "absl")) return true;
-            break;
-          }
-        }
-      }
-  }
+/// WillNotOverflowSignedAdd - Return true if we can prove that:
+///    (sext (add LHS, RHS))  === (add (sext LHS), (sext RHS))
+/// This basically requires proving that the add in the original type would not
+/// overflow to change the sign bit or have a carry out.
+bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) {
+  // There are different heuristics we can use for this.  Here are some simple
+  // ones.
+  
+  // Add has the property that adding any two 2's complement numbers can only 
+  // have one carry bit which can change a sign.  As such, if LHS and RHS each
+  // have at least two sign bits, we know that the addition of the two values will
+  // sign extend fine.
+  if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1)
+    return true;
+  
+  
+  // If one of the operands only has one non-zero bit, and if the other operand
+  // has a known-zero bit in a more significant place than it (not including the
+  // sign bit) the ripple may go up to and fill the zero, but won't change the
+  // sign.  For example, (X & ~4) + 1.
+  
+  // TODO: Implement.
   
   return false;
 }
@@ -2333,7 +1929,7 @@
       const APInt& Val = CI->getValue();
       uint32_t BitWidth = Val.getBitWidth();
       if (Val == APInt::getSignBit(BitWidth))
-        return BinaryOperator::createXor(LHS, RHS);
+        return BinaryOperator::CreateXor(LHS, RHS);
       
       // See if SimplifyDemandedBits can simplify this.  This handles stuff like
       // (X & 254)+1 -> (X&254)|1
@@ -2378,9 +1974,9 @@
       } while (Size >= 1);
       
       // FIXME: This shouldn't be necessary. When the backends can handle types
-      // with funny bit widths then this whole cascade of if statements should
-      // be removed. It is just here to get the size of the "middle" type back
-      // up to something that the back ends can handle.
+      // with funny bit widths then this switch statement should be removed. It
+      // is just here to get the size of the "middle" type back up to something
+      // that the back ends can handle.
       const Type *MiddleType = 0;
       switch (Size) {
         default: break;
@@ -2396,8 +1992,11 @@
     }
   }
 
+  if (I.getType() == Type::Int1Ty)
+    return BinaryOperator::CreateXor(LHS, RHS);
+
   // X + X --> X << 1
-  if (I.getType()->isInteger() && I.getType() != Type::Int1Ty) {
+  if (I.getType()->isInteger()) {
     if (Instruction *Result = AssociativeOpt(I, AddRHS(RHS))) return Result;
 
     if (Instruction *RHSI = dyn_cast<Instruction>(RHS)) {
@@ -2417,35 +2016,35 @@
   if (Value *LHSV = dyn_castNegVal(LHS)) {
     if (LHS->getType()->isIntOrIntVector()) {
       if (Value *RHSV = dyn_castNegVal(RHS)) {
-        Instruction *NewAdd = BinaryOperator::createAdd(LHSV, RHSV, "sum");
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSV, RHSV, "sum");
         InsertNewInstBefore(NewAdd, I);
-        return BinaryOperator::createNeg(NewAdd);
+        return BinaryOperator::CreateNeg(NewAdd);
       }
     }
     
-    return BinaryOperator::createSub(RHS, LHSV);
+    return BinaryOperator::CreateSub(RHS, LHSV);
   }
 
   // A + -B  -->  A - B
   if (!isa<Constant>(RHS))
     if (Value *V = dyn_castNegVal(RHS))
-      return BinaryOperator::createSub(LHS, V);
+      return BinaryOperator::CreateSub(LHS, V);
 
 
   ConstantInt *C2;
   if (Value *X = dyn_castFoldableMul(LHS, C2)) {
     if (X == RHS)   // X*C + X --> X * (C+1)
-      return BinaryOperator::createMul(RHS, AddOne(C2));
+      return BinaryOperator::CreateMul(RHS, AddOne(C2));
 
     // X*C1 + X*C2 --> X * (C1+C2)
     ConstantInt *C1;
     if (X == dyn_castFoldableMul(RHS, C1))
-      return BinaryOperator::createMul(X, Add(C1, C2));
+      return BinaryOperator::CreateMul(X, Add(C1, C2));
   }
 
   // X + X*C --> X * (C+1)
   if (dyn_castFoldableMul(RHS, C2) == LHS)
-    return BinaryOperator::createMul(LHS, AddOne(C2));
+    return BinaryOperator::CreateMul(LHS, AddOne(C2));
 
   // X + ~X --> -1   since   ~X = -X-1
   if (dyn_castNotVal(LHS) == RHS || dyn_castNotVal(RHS) == LHS)
@@ -2456,6 +2055,23 @@
   if (match(RHS, m_And(m_Value(), m_ConstantInt(C2))))
     if (Instruction *R = AssociativeOpt(I, AddMaskingAnd(C2)))
       return R;
+  
+  // A+B --> A|B iff A and B have no bits set in common.
+  if (const IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
+    APInt Mask = APInt::getAllOnesValue(IT->getBitWidth());
+    APInt LHSKnownOne(IT->getBitWidth(), 0);
+    APInt LHSKnownZero(IT->getBitWidth(), 0);
+    ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne);
+    if (LHSKnownZero != 0) {
+      APInt RHSKnownOne(IT->getBitWidth(), 0);
+      APInt RHSKnownZero(IT->getBitWidth(), 0);
+      ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne);
+      
+      // No bits in common -> bitwise or.
+      if ((LHSKnownZero|RHSKnownZero).isAllOnesValue())
+        return BinaryOperator::CreateOr(LHS, RHS);
+    }
+  }
 
   // W*X + Y*Z --> W * (X+Z)  iff W == Y
   if (I.getType()->isIntOrIntVector()) {
@@ -2474,9 +2090,9 @@
       }
 
       if (W == Y) {
-        Value *NewAdd = InsertNewInstBefore(BinaryOperator::createAdd(X, Z,
+        Value *NewAdd = InsertNewInstBefore(BinaryOperator::CreateAdd(X, Z,
                                                             LHS->getName()), I);
-        return BinaryOperator::createMul(W, NewAdd);
+        return BinaryOperator::CreateMul(W, NewAdd);
       }
     }
   }
@@ -2484,7 +2100,7 @@
   if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
     Value *X = 0;
     if (match(LHS, m_Not(m_Value(X))))    // ~X + C --> (C-1) - X
-      return BinaryOperator::createSub(SubOne(CRHS), X);
+      return BinaryOperator::CreateSub(SubOne(CRHS), X);
 
     // (X & FF00) + xx00  -> (X+xx00) & FF00
     if (LHS->hasOneUse() && match(LHS, m_And(m_Value(X), m_ConstantInt(C2)))) {
@@ -2502,9 +2118,9 @@
 
         if (AddRHSHighBits == AddRHSHighBitsAnd) {
           // Okay, the xform is safe.  Insert the new add pronto.
-          Value *NewAdd = InsertNewInstBefore(BinaryOperator::createAdd(X, CRHS,
+          Value *NewAdd = InsertNewInstBefore(BinaryOperator::CreateAdd(X, CRHS,
                                                             LHS->getName()), I);
-          return BinaryOperator::createAnd(NewAdd, C2);
+          return BinaryOperator::CreateAnd(NewAdd, C2);
         }
       }
     }
@@ -2566,16 +2182,87 @@
     if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS))
       return ReplaceInstUsesWith(I, LHS);
 
+  // Check for (add (sext x), y), see if we can merge this into an
+  // integer add followed by a sext.
+  if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) {
+    // (add (sext x), cst) --> (sext (add x, cst'))
+    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
+      Constant *CI = 
+        ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
+      if (LHSConv->hasOneUse() &&
+          ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+        // Insert the new, smaller add.
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0), 
+                                                        CI, "addconv");
+        InsertNewInstBefore(NewAdd, I);
+        return new SExtInst(NewAdd, I.getType());
+      }
+    }
+    
+    // (add (sext x), (sext y)) --> (sext (add int x, y))
+    if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
+      // Only do this if x/y have the same type, if at last one of them has a
+      // single use (so we don't increase the number of sexts), and if the
+      // integer add will not overflow.
+      if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+                                   RHSConv->getOperand(0))) {
+        // Insert the new integer add.
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0), 
+                                                        RHSConv->getOperand(0),
+                                                        "addconv");
+        InsertNewInstBefore(NewAdd, I);
+        return new SExtInst(NewAdd, I.getType());
+      }
+    }
+  }
+  
+  // Check for (add double (sitofp x), y), see if we can merge this into an
+  // integer add followed by a promotion.
+  if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+    // (add double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
+    // ... if the constant fits in the integer value.  This is useful for things
+    // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
+    // requires a constant pool load, and generally allows the add to be better
+    // instcombined.
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
+      Constant *CI = 
+      ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
+      if (LHSConv->hasOneUse() &&
+          ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+        // Insert the new integer add.
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0), 
+                                                        CI, "addconv");
+        InsertNewInstBefore(NewAdd, I);
+        return new SIToFPInst(NewAdd, I.getType());
+      }
+    }
+    
+    // (add double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
+    if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
+      // Only do this if x/y have the same type, if at last one of them has a
+      // single use (so we don't increase the number of int->fp conversions),
+      // and if the integer add will not overflow.
+      if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+                                   RHSConv->getOperand(0))) {
+        // Insert the new integer add.
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0), 
+                                                        RHSConv->getOperand(0),
+                                                        "addconv");
+        InsertNewInstBefore(NewAdd, I);
+        return new SIToFPInst(NewAdd, I.getType());
+      }
+    }
+  }
+  
   return Changed ? &I : 0;
 }
 
-// isSignBit - Return true if the value represented by the constant only has the
-// highest order bit set.
-static bool isSignBit(ConstantInt *CI) {
-  uint32_t NumBits = CI->getType()->getPrimitiveSizeInBits();
-  return CI->getValue() == APInt::getSignBit(NumBits);
-}
-
 Instruction *InstCombiner::visitSub(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
@@ -2584,7 +2271,7 @@
 
   // If this is a 'B = x-(-A)', change to B = x+A...
   if (Value *V = dyn_castNegVal(Op1))
-    return BinaryOperator::createAdd(Op0, V);
+    return BinaryOperator::CreateAdd(Op0, V);
 
   if (isa<UndefValue>(Op0))
     return ReplaceInstUsesWith(I, Op0);    // undef - X -> undef
@@ -2594,12 +2281,12 @@
   if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) {
     // Replace (-1 - A) with (~A)...
     if (C->isAllOnesValue())
-      return BinaryOperator::createNot(Op1);
+      return BinaryOperator::CreateNot(Op1);
 
     // C - ~X == X + (1+C)
     Value *X = 0;
     if (match(Op1, m_Not(m_Value(X))))
-      return BinaryOperator::createAdd(X, AddOne(C));
+      return BinaryOperator::CreateAdd(X, AddOne(C));
 
     // -(X >>u 31) -> (X >>s 31)
     // -(X >>s 31) -> (X >>u 31)
@@ -2611,7 +2298,7 @@
             if (CU->getLimitedValue(SI->getType()->getPrimitiveSizeInBits()) ==
                 SI->getType()->getPrimitiveSizeInBits()-1) {
               // Ok, the transformation is safe.  Insert AShr.
-              return BinaryOperator::create(Instruction::AShr, 
+              return BinaryOperator::Create(Instruction::AShr, 
                                           SI->getOperand(0), CU, SI->getName());
             }
           }
@@ -2622,7 +2309,7 @@
             if (CU->getLimitedValue(SI->getType()->getPrimitiveSizeInBits()) ==
                 SI->getType()->getPrimitiveSizeInBits()-1) {
               // Ok, the transformation is safe.  Insert LShr. 
-              return BinaryOperator::createLShr(
+              return BinaryOperator::CreateLShr(
                                           SI->getOperand(0), CU, SI->getName());
             }
           }
@@ -2640,17 +2327,20 @@
         return NV;
   }
 
+  if (I.getType() == Type::Int1Ty)
+    return BinaryOperator::CreateXor(Op0, Op1);
+
   if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) {
     if (Op1I->getOpcode() == Instruction::Add &&
         !Op0->getType()->isFPOrFPVector()) {
       if (Op1I->getOperand(0) == Op0)              // X-(X+Y) == -Y
-        return BinaryOperator::createNeg(Op1I->getOperand(1), I.getName());
+        return BinaryOperator::CreateNeg(Op1I->getOperand(1), I.getName());
       else if (Op1I->getOperand(1) == Op0)         // X-(Y+X) == -Y
-        return BinaryOperator::createNeg(Op1I->getOperand(0), I.getName());
+        return BinaryOperator::CreateNeg(Op1I->getOperand(0), I.getName());
       else if (ConstantInt *CI1 = dyn_cast<ConstantInt>(I.getOperand(0))) {
         if (ConstantInt *CI2 = dyn_cast<ConstantInt>(Op1I->getOperand(1)))
           // C1-(X+C2) --> (C1-C2)-X
-          return BinaryOperator::createSub(Subtract(CI1, CI2), 
+          return BinaryOperator::CreateSub(Subtract(CI1, CI2), 
                                            Op1I->getOperand(0));
       }
     }
@@ -2667,7 +2357,7 @@
         Op1I->setOperand(1, IIOp0);
 
         // Create the new top level add instruction...
-        return BinaryOperator::createAdd(Op0, Op1);
+        return BinaryOperator::CreateAdd(Op0, Op1);
       }
 
       // Replace (A - (A & B)) with (A & ~B) if this is the only use of (A&B)...
@@ -2677,8 +2367,8 @@
         Value *OtherOp = Op1I->getOperand(Op1I->getOperand(0) == Op0);
 
         Value *NewNot =
-          InsertNewInstBefore(BinaryOperator::createNot(OtherOp, "B.not"), I);
-        return BinaryOperator::createAnd(Op0, NewNot);
+          InsertNewInstBefore(BinaryOperator::CreateNot(OtherOp, "B.not"), I);
+        return BinaryOperator::CreateAnd(Op0, NewNot);
       }
 
       // 0 - (X sdiv C)  -> (X sdiv -C)
@@ -2686,14 +2376,14 @@
         if (ConstantInt *CSI = dyn_cast<ConstantInt>(Op0))
           if (CSI->isZero())
             if (Constant *DivRHS = dyn_cast<Constant>(Op1I->getOperand(1)))
-              return BinaryOperator::createSDiv(Op1I->getOperand(0),
+              return BinaryOperator::CreateSDiv(Op1I->getOperand(0),
                                                ConstantExpr::getNeg(DivRHS));
 
       // X - X*C --> X * (1-C)
       ConstantInt *C2 = 0;
       if (dyn_castFoldableMul(Op1I, C2) == Op0) {
         Constant *CP1 = Subtract(ConstantInt::get(I.getType(), 1), C2);
-        return BinaryOperator::createMul(Op0, CP1);
+        return BinaryOperator::CreateMul(Op0, CP1);
       }
 
       // X - ((X / Y) * Y) --> X % Y
@@ -2702,9 +2392,9 @@
           if (Op0 == I->getOperand(0) &&
               Op1I->getOperand(1) == I->getOperand(1)) {
             if (I->getOpcode() == Instruction::SDiv)
-              return BinaryOperator::createSRem(Op0, Op1I->getOperand(1));
+              return BinaryOperator::CreateSRem(Op0, Op1I->getOperand(1));
             if (I->getOpcode() == Instruction::UDiv)
-              return BinaryOperator::createURem(Op0, Op1I->getOperand(1));
+              return BinaryOperator::CreateURem(Op0, Op1I->getOperand(1));
           }
     }
   }
@@ -2718,18 +2408,18 @@
           return ReplaceInstUsesWith(I, Op0I->getOperand(0));
       } else if (Op0I->getOpcode() == Instruction::Sub) {
         if (Op0I->getOperand(0) == Op1)             // (X-Y)-X == -Y
-          return BinaryOperator::createNeg(Op0I->getOperand(1), I.getName());
+          return BinaryOperator::CreateNeg(Op0I->getOperand(1), I.getName());
       }
     }
 
   ConstantInt *C1;
   if (Value *X = dyn_castFoldableMul(Op0, C1)) {
     if (X == Op1)  // X*C - X --> X * (C-1)
-      return BinaryOperator::createMul(Op1, SubOne(C1));
+      return BinaryOperator::CreateMul(Op1, SubOne(C1));
 
     ConstantInt *C2;   // X*C1 - X*C2 -> X * (C1-C2)
     if (X == dyn_castFoldableMul(Op1, C2))
-      return BinaryOperator::createMul(X, Subtract(C1, C2));
+      return BinaryOperator::CreateMul(X, Subtract(C1, C2));
   }
   return 0;
 }
@@ -2758,8 +2448,7 @@
   case ICmpInst::ICMP_UGE: 
     // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
     TrueIfSigned = true;
-    return RHS->getValue() == 
-      APInt::getSignBit(RHS->getType()->getPrimitiveSizeInBits());
+    return RHS->getValue().isSignBit();
   default:
     return false;
   }
@@ -2780,7 +2469,7 @@
       if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0))
         if (SI->getOpcode() == Instruction::Shl)
           if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1)))
-            return BinaryOperator::createMul(SI->getOperand(0),
+            return BinaryOperator::CreateMul(SI->getOperand(0),
                                              ConstantExpr::getShl(CI, ShOp));
 
       if (CI->isZero())
@@ -2788,11 +2477,11 @@
       if (CI->equalsInt(1))                  // X * 1  == X
         return ReplaceInstUsesWith(I, Op0);
       if (CI->isAllOnesValue())              // X * -1 == 0 - X
-        return BinaryOperator::createNeg(Op0, I.getName());
+        return BinaryOperator::CreateNeg(Op0, I.getName());
 
       const APInt& Val = cast<ConstantInt>(CI)->getValue();
       if (Val.isPowerOf2()) {          // Replace X*(2^C) with X << C
-        return BinaryOperator::createShl(Op0,
+        return BinaryOperator::CreateShl(Op0,
                  ConstantInt::get(Op0->getType(), Val.logBase2()));
       }
     } else if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1)) {
@@ -2809,14 +2498,14 @@
     
     if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
       if (Op0I->getOpcode() == Instruction::Add && Op0I->hasOneUse() &&
-          isa<ConstantInt>(Op0I->getOperand(1))) {
+          isa<ConstantInt>(Op0I->getOperand(1)) && isa<ConstantInt>(Op1)) {
         // Canonicalize (X+C1)*C2 -> X*C2+C1*C2.
-        Instruction *Add = BinaryOperator::createMul(Op0I->getOperand(0),
+        Instruction *Add = BinaryOperator::CreateMul(Op0I->getOperand(0),
                                                      Op1, "tmp");
         InsertNewInstBefore(Add, I);
         Value *C1C2 = ConstantExpr::getMul(Op1, 
                                            cast<Constant>(Op0I->getOperand(1)));
-        return BinaryOperator::createAdd(Add, C1C2);
+        return BinaryOperator::CreateAdd(Add, C1C2);
         
       }
 
@@ -2832,14 +2521,17 @@
 
   if (Value *Op0v = dyn_castNegVal(Op0))     // -X * -Y = X*Y
     if (Value *Op1v = dyn_castNegVal(I.getOperand(1)))
-      return BinaryOperator::createMul(Op0v, Op1v);
+      return BinaryOperator::CreateMul(Op0v, Op1v);
+
+  if (I.getType() == Type::Int1Ty)
+    return BinaryOperator::CreateAnd(Op0, I.getOperand(1));
 
   // If one of the operands of the multiply is a cast from a boolean value, then
   // we know the bool is either zero or one, so this is a 'masking' multiply.
   // See if we can simplify things based on how the boolean was originally
   // formed.
   CastInst *BoolCast = 0;
-  if (ZExtInst *CI = dyn_cast<ZExtInst>(I.getOperand(0)))
+  if (ZExtInst *CI = dyn_cast<ZExtInst>(Op0))
     if (CI->getOperand(0)->getType() == Type::Int1Ty)
       BoolCast = CI;
   if (!BoolCast)
@@ -2862,7 +2554,7 @@
                                           SCOpTy->getPrimitiveSizeInBits()-1);
         Value *V =
           InsertNewInstBefore(
-            BinaryOperator::create(Instruction::AShr, SCIOp0, Amt,
+            BinaryOperator::Create(Instruction::AShr, SCIOp0, Amt,
                                             BoolCast->getOperand(0)->getName()+
                                             ".mask"), I);
 
@@ -2878,7 +2570,7 @@
         }
 
         Value *OtherOp = Op0 == BoolCast ? I.getOperand(1) : Op0;
-        return BinaryOperator::createAnd(V, OtherOp);
+        return BinaryOperator::CreateAnd(V, OtherOp);
       }
     }
   }
@@ -2949,6 +2641,18 @@
 Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
+  // (sdiv X, X) --> 1     (udiv X, X) --> 1
+  if (Op0 == Op1) {
+    if (const VectorType *Ty = dyn_cast<VectorType>(I.getType())) {
+      ConstantInt *CI = ConstantInt::get(Ty->getElementType(), 1);
+      std::vector<Constant*> Elts(Ty->getNumElements(), CI);
+      return ReplaceInstUsesWith(I, ConstantVector::get(Elts));
+    }
+
+    ConstantInt *CI = ConstantInt::get(I.getType(), 1);
+    return ReplaceInstUsesWith(I, CI);
+  }
+  
   if (Instruction *Common = commonDivTransforms(I))
     return Common;
 
@@ -2964,7 +2668,7 @@
           if (MultiplyOverflows(RHS, LHSRHS, I.getOpcode()==Instruction::SDiv))
             return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
           else 
-            return BinaryOperator::create(I.getOpcode(), LHS->getOperand(0),
+            return BinaryOperator::Create(I.getOpcode(), LHS->getOperand(0),
                                           Multiply(RHS, LHSRHS));
         }
 
@@ -2983,6 +2687,10 @@
     if (LHS->equalsInt(0))
       return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
 
+  // It can't be division by zero, hence it must be division by one.
+  if (I.getType() == Type::Int1Ty)
+    return ReplaceInstUsesWith(I, Op0);
+
   return 0;
 }
 
@@ -2998,7 +2706,7 @@
   // if so, convert to a right shift.
   if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
     if (C->getValue().isPowerOf2())  // 0 not included in isPowerOf2
-      return BinaryOperator::createLShr(Op0, 
+      return BinaryOperator::CreateLShr(Op0, 
                ConstantInt::get(Op0->getType(), C->getValue().logBase2()));
   }
 
@@ -3012,9 +2720,9 @@
         const Type *NTy = N->getType();
         if (uint32_t C2 = C1.logBase2()) {
           Constant *C2V = ConstantInt::get(NTy, C2);
-          N = InsertNewInstBefore(BinaryOperator::createAdd(N, C2V, "tmp"), I);
+          N = InsertNewInstBefore(BinaryOperator::CreateAdd(N, C2V, "tmp"), I);
         }
-        return BinaryOperator::createLShr(Op0, N);
+        return BinaryOperator::CreateLShr(Op0, N);
       }
     }
   }
@@ -3030,13 +2738,13 @@
           uint32_t TSA = TVA.logBase2(), FSA = FVA.logBase2();
           // Construct the "on true" case of the select
           Constant *TC = ConstantInt::get(Op0->getType(), TSA);
-          Instruction *TSI = BinaryOperator::createLShr(
+          Instruction *TSI = BinaryOperator::CreateLShr(
                                                  Op0, TC, SI->getName()+".t");
           TSI = InsertNewInstBefore(TSI, I);
   
           // Construct the "on false" case of the select
           Constant *FC = ConstantInt::get(Op0->getType(), FSA); 
-          Instruction *FSI = BinaryOperator::createLShr(
+          Instruction *FSI = BinaryOperator::CreateLShr(
                                                  Op0, FC, SI->getName()+".f");
           FSI = InsertNewInstBefore(FSI, I);
 
@@ -3057,11 +2765,11 @@
   if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
     // sdiv X, -1 == -X
     if (RHS->isAllOnesValue())
-      return BinaryOperator::createNeg(Op0);
+      return BinaryOperator::CreateNeg(Op0);
 
     // -X/C -> X/-C
     if (Value *LHSNeg = dyn_castNegVal(Op0))
-      return BinaryOperator::createSDiv(LHSNeg, ConstantExpr::getNeg(RHS));
+      return BinaryOperator::CreateSDiv(LHSNeg, ConstantExpr::getNeg(RHS));
   }
 
   // If the sign bits of both operands are zero (i.e. we can prove they are
@@ -3070,7 +2778,7 @@
     APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
     if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
       // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
-      return BinaryOperator::createUDiv(Op0, Op1, I.getName());
+      return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
     }
   }      
   
@@ -3187,7 +2895,7 @@
     // if so, convert to a bitwise and.
     if (ConstantInt *C = dyn_cast<ConstantInt>(RHS))
       if (C->getValue().isPowerOf2())
-        return BinaryOperator::createAnd(Op0, SubOne(C));
+        return BinaryOperator::CreateAnd(Op0, SubOne(C));
   }
 
   if (Instruction *RHSI = dyn_cast<Instruction>(I.getOperand(1))) {
@@ -3196,9 +2904,9 @@
         isa<ConstantInt>(RHSI->getOperand(0))) {
       if (cast<ConstantInt>(RHSI->getOperand(0))->getValue().isPowerOf2()) {
         Constant *N1 = ConstantInt::getAllOnesValue(I.getType());
-        Value *Add = InsertNewInstBefore(BinaryOperator::createAdd(RHSI, N1,
+        Value *Add = InsertNewInstBefore(BinaryOperator::CreateAdd(RHSI, N1,
                                                                    "tmp"), I);
-        return BinaryOperator::createAnd(Op0, Add);
+        return BinaryOperator::CreateAnd(Op0, Add);
       }
     }
   }
@@ -3212,9 +2920,9 @@
         if ((STO->getValue().isPowerOf2()) && 
             (SFO->getValue().isPowerOf2())) {
           Value *TrueAnd = InsertNewInstBefore(
-            BinaryOperator::createAnd(Op0, SubOne(STO), SI->getName()+".t"), I);
+            BinaryOperator::CreateAnd(Op0, SubOne(STO), SI->getName()+".t"), I);
           Value *FalseAnd = InsertNewInstBefore(
-            BinaryOperator::createAnd(Op0, SubOne(SFO), SI->getName()+".f"), I);
+            BinaryOperator::CreateAnd(Op0, SubOne(SFO), SI->getName()+".f"), I);
           return SelectInst::Create(SI->getOperand(0), TrueAnd, FalseAnd);
         }
       }
@@ -3245,7 +2953,7 @@
     APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
     if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
       // X srem Y -> X urem Y, iff X and Y don't have sign bit set
-      return BinaryOperator::createURem(Op0, Op1, I.getName());
+      return BinaryOperator::CreateURem(Op0, Op1, I.getName());
     }
   }
 
@@ -3433,10 +3141,10 @@
   case Instruction::Xor:
     if (Op->hasOneUse()) {
       // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
-      Instruction *And = BinaryOperator::createAnd(X, AndRHS);
+      Instruction *And = BinaryOperator::CreateAnd(X, AndRHS);
       InsertNewInstBefore(And, TheAnd);
       And->takeName(Op);
-      return BinaryOperator::createXor(And, Together);
+      return BinaryOperator::CreateXor(And, Together);
     }
     break;
   case Instruction::Or:
@@ -3445,10 +3153,10 @@
 
     if (Op->hasOneUse() && Together != OpRHS) {
       // (X | C1) & C2 --> (X | (C1&C2)) & C2
-      Instruction *Or = BinaryOperator::createOr(X, Together);
+      Instruction *Or = BinaryOperator::CreateOr(X, Together);
       InsertNewInstBefore(Or, TheAnd);
       Or->takeName(Op);
-      return BinaryOperator::createAnd(Or, AndRHS);
+      return BinaryOperator::CreateAnd(Or, AndRHS);
     }
     break;
   case Instruction::Add:
@@ -3476,10 +3184,10 @@
             return &TheAnd;
           } else {
             // Pull the XOR out of the AND.
-            Instruction *NewAnd = BinaryOperator::createAnd(X, AndRHS);
+            Instruction *NewAnd = BinaryOperator::CreateAnd(X, AndRHS);
             InsertNewInstBefore(NewAnd, TheAnd);
             NewAnd->takeName(Op);
-            return BinaryOperator::createXor(NewAnd, AndRHS);
+            return BinaryOperator::CreateXor(NewAnd, AndRHS);
           }
         }
       }
@@ -3538,9 +3246,9 @@
         // Make the argument unsigned.
         Value *ShVal = Op->getOperand(0);
         ShVal = InsertNewInstBefore(
-            BinaryOperator::createLShr(ShVal, OpRHS, 
+            BinaryOperator::CreateLShr(ShVal, OpRHS, 
                                    Op->getName()), TheAnd);
-        return BinaryOperator::createAnd(ShVal, AndRHS, TheAnd.getName());
+        return BinaryOperator::CreateAnd(ShVal, AndRHS, TheAnd.getName());
       }
     }
     break;
@@ -3574,7 +3282,7 @@
 
     // Emit V-Lo <u Hi-Lo
     Constant *NegLo = ConstantExpr::getNeg(Lo);
-    Instruction *Add = BinaryOperator::createAdd(V, NegLo, V->getName()+".off");
+    Instruction *Add = BinaryOperator::CreateAdd(V, NegLo, V->getName()+".off");
     InsertNewInstBefore(Add, IB);
     Constant *UpperBound = ConstantExpr::getAdd(NegLo, Hi);
     return new ICmpInst(ICmpInst::ICMP_ULT, Add, UpperBound);
@@ -3594,7 +3302,7 @@
   // Emit V-Lo >u Hi-1-Lo
   // Note that Hi has already had one subtracted from it, above.
   ConstantInt *NegLo = cast<ConstantInt>(ConstantExpr::getNeg(Lo));
-  Instruction *Add = BinaryOperator::createAdd(V, NegLo, V->getName()+".off");
+  Instruction *Add = BinaryOperator::CreateAdd(V, NegLo, V->getName()+".off");
   InsertNewInstBefore(Add, IB);
   Constant *LowerBound = ConstantExpr::getAdd(NegLo, Hi);
   return new ICmpInst(ICmpInst::ICMP_UGT, Add, LowerBound);
@@ -3669,9 +3377,9 @@
   
   Instruction *New;
   if (isSub)
-    New = BinaryOperator::createSub(LHSI->getOperand(0), RHS, "fold");
+    New = BinaryOperator::CreateSub(LHSI->getOperand(0), RHS, "fold");
   else
-    New = BinaryOperator::createAdd(LHSI->getOperand(0), RHS, "fold");
+    New = BinaryOperator::CreateAdd(LHSI->getOperand(0), RHS, "fold");
   return InsertNewInstBefore(New, I);
 }
 
@@ -3719,19 +3427,19 @@
         if (Op0I->hasOneUse()) {
           if (MaskedValueIsZero(Op0LHS, NotAndRHS)) {
             // Not masking anything out for the LHS, move to RHS.
-            Instruction *NewRHS = BinaryOperator::createAnd(Op0RHS, AndRHS,
+            Instruction *NewRHS = BinaryOperator::CreateAnd(Op0RHS, AndRHS,
                                                    Op0RHS->getName()+".masked");
             InsertNewInstBefore(NewRHS, I);
-            return BinaryOperator::create(
+            return BinaryOperator::Create(
                        cast<BinaryOperator>(Op0I)->getOpcode(), Op0LHS, NewRHS);
           }
           if (!isa<Constant>(Op0RHS) &&
               MaskedValueIsZero(Op0RHS, NotAndRHS)) {
             // Not masking anything out for the RHS, move to LHS.
-            Instruction *NewLHS = BinaryOperator::createAnd(Op0LHS, AndRHS,
+            Instruction *NewLHS = BinaryOperator::CreateAnd(Op0LHS, AndRHS,
                                                    Op0LHS->getName()+".masked");
             InsertNewInstBefore(NewLHS, I);
-            return BinaryOperator::create(
+            return BinaryOperator::Create(
                        cast<BinaryOperator>(Op0I)->getOpcode(), NewLHS, Op0RHS);
           }
         }
@@ -3742,9 +3450,9 @@
         // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
         // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
         if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I))
-          return BinaryOperator::createAnd(V, AndRHS);
+          return BinaryOperator::CreateAnd(V, AndRHS);
         if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I))
-          return BinaryOperator::createAnd(V, AndRHS);  // Add commutes
+          return BinaryOperator::CreateAnd(V, AndRHS);  // Add commutes
         break;
 
       case Instruction::Sub:
@@ -3752,7 +3460,7 @@
         // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
         // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
         if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))
-          return BinaryOperator::createAnd(V, AndRHS);
+          return BinaryOperator::CreateAnd(V, AndRHS);
         break;
       }
 
@@ -3772,14 +3480,14 @@
               // into  : and (cast X to T), trunc_or_bitcast(C1)&C2
               // This will fold the two constants together, which may allow 
               // other simplifications.
-              Instruction *NewCast = CastInst::createTruncOrBitCast(
+              Instruction *NewCast = CastInst::CreateTruncOrBitCast(
                 CastOp->getOperand(0), I.getType(), 
                 CastOp->getName()+".shrunk");
               NewCast = InsertNewInstBefore(NewCast, I);
               // trunc_or_bitcast(C1)&C2
               Constant *C3 = ConstantExpr::getTruncOrBitCast(AndCI,I.getType());
               C3 = ConstantExpr::getAnd(C3, AndRHS);
-              return BinaryOperator::createAnd(NewCast, C3);
+              return BinaryOperator::CreateAnd(NewCast, C3);
             } else if (CastOp->getOpcode() == Instruction::Or) {
               // Change: and (cast (or X, C1) to T), C2
               // into  : trunc(C1)&C2 iff trunc(C1)&C2 == C2
@@ -3808,10 +3516,10 @@
 
   // (~A & ~B) == (~(A | B)) - De Morgan's Law
   if (Op0NotVal && Op1NotVal && isOnlyUse(Op0) && isOnlyUse(Op1)) {
-    Instruction *Or = BinaryOperator::createOr(Op0NotVal, Op1NotVal,
+    Instruction *Or = BinaryOperator::CreateOr(Op0NotVal, Op1NotVal,
                                                I.getName()+".demorgan");
     InsertNewInstBefore(Or, I);
-    return BinaryOperator::createNot(Or);
+    return BinaryOperator::CreateNot(Or);
   }
   
   {
@@ -3823,7 +3531,7 @@
       // (A|B) & ~(A&B) -> A^B
       if (match(Op1, m_Not(m_And(m_Value(C), m_Value(D))))) {
         if ((A == C && B == D) || (A == D && B == C))
-          return BinaryOperator::createXor(A, B);
+          return BinaryOperator::CreateXor(A, B);
       }
     }
     
@@ -3834,7 +3542,7 @@
       // ~(A&B) & (A|B) -> A^B
       if (match(Op0, m_Not(m_And(m_Value(C), m_Value(D))))) {
         if ((A == C && B == D) || (A == D && B == C))
-          return BinaryOperator::createXor(A, B);
+          return BinaryOperator::CreateXor(A, B);
       }
     }
     
@@ -3856,9 +3564,9 @@
         std::swap(A, B);
       }
       if (A == Op0) {                                // A&(A^B) -> A & ~B
-        Instruction *NotB = BinaryOperator::createNot(B, "tmp");
+        Instruction *NotB = BinaryOperator::CreateNot(B, "tmp");
         InsertNewInstBefore(NotB, I);
-        return BinaryOperator::createAnd(A, NotB);
+        return BinaryOperator::CreateAnd(A, NotB);
       }
     }
   }
@@ -3941,7 +3649,7 @@
             case ICmpInst::ICMP_NE:
               if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
                 Constant *AddCST = ConstantExpr::getNeg(LHSCst);
-                Instruction *Add = BinaryOperator::createAdd(LHSVal, AddCST,
+                Instruction *Add = BinaryOperator::CreateAdd(LHSVal, AddCST,
                                                       LHSVal->getName()+".off");
                 InsertNewInstBefore(Add, I);
                 return new ICmpInst(ICmpInst::ICMP_UGT, Add,
@@ -3983,8 +3691,7 @@
           case ICmpInst::ICMP_UGT:
             switch (RHSCC) {
             default: assert(0 && "Unknown integer condition code!");
-            case ICmpInst::ICMP_EQ:         // (X u> 13 & X == 15) -> X > 13
-              return ReplaceInstUsesWith(I, LHS);
+            case ICmpInst::ICMP_EQ:         // (X u> 13 & X == 15) -> X == 15
             case ICmpInst::ICMP_UGT:        // (X u> 13 & X u> 15) -> X u> 15
               return ReplaceInstUsesWith(I, RHS);
             case ICmpInst::ICMP_SGT:        // (X u> 13 & X s> 15) -> no change
@@ -4034,11 +3741,11 @@
                               I.getType(), TD) &&
             ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
                               I.getType(), TD)) {
-          Instruction *NewOp = BinaryOperator::createAnd(Op0C->getOperand(0),
+          Instruction *NewOp = BinaryOperator::CreateAnd(Op0C->getOperand(0),
                                                          Op1C->getOperand(0),
                                                          I.getName());
           InsertNewInstBefore(NewOp, I);
-          return CastInst::create(Op0C->getOpcode(), NewOp, I.getType());
+          return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
         }
       }
     
@@ -4049,10 +3756,10 @@
           SI0->getOperand(1) == SI1->getOperand(1) &&
           (SI0->hasOneUse() || SI1->hasOneUse())) {
         Instruction *NewOp =
-          InsertNewInstBefore(BinaryOperator::createAnd(SI0->getOperand(0),
+          InsertNewInstBefore(BinaryOperator::CreateAnd(SI0->getOperand(0),
                                                         SI1->getOperand(0),
                                                         SI0->getName()), I);
-        return BinaryOperator::create(SI1->getOpcode(), NewOp, 
+        return BinaryOperator::Create(SI1->getOpcode(), NewOp, 
                                       SI1->getOperand(1));
       }
   }
@@ -4224,19 +3931,19 @@
     ConstantInt *C1 = 0; Value *X = 0;
     // (X & C1) | C2 --> (X | C2) & (C1|C2)
     if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
-      Instruction *Or = BinaryOperator::createOr(X, RHS);
+      Instruction *Or = BinaryOperator::CreateOr(X, RHS);
       InsertNewInstBefore(Or, I);
       Or->takeName(Op0);
-      return BinaryOperator::createAnd(Or, 
+      return BinaryOperator::CreateAnd(Or, 
                ConstantInt::get(RHS->getValue() | C1->getValue()));
     }
 
     // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
     if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
-      Instruction *Or = BinaryOperator::createOr(X, RHS);
+      Instruction *Or = BinaryOperator::CreateOr(X, RHS);
       InsertNewInstBefore(Or, I);
       Or->takeName(Op0);
-      return BinaryOperator::createXor(Or,
+      return BinaryOperator::CreateXor(Or,
                  ConstantInt::get(C1->getValue() & ~RHS->getValue()));
     }
 
@@ -4272,19 +3979,19 @@
   // (X^C)|Y -> (X|Y)^C iff Y&C == 0
   if (Op0->hasOneUse() && match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
       MaskedValueIsZero(Op1, C1->getValue())) {
-    Instruction *NOr = BinaryOperator::createOr(A, Op1);
+    Instruction *NOr = BinaryOperator::CreateOr(A, Op1);
     InsertNewInstBefore(NOr, I);
     NOr->takeName(Op0);
-    return BinaryOperator::createXor(NOr, C1);
+    return BinaryOperator::CreateXor(NOr, C1);
   }
 
   // Y|(X^C) -> (X|Y)^C iff Y&C == 0
   if (Op1->hasOneUse() && match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
       MaskedValueIsZero(Op0, C1->getValue())) {
-    Instruction *NOr = BinaryOperator::createOr(A, Op0);
+    Instruction *NOr = BinaryOperator::CreateOr(A, Op0);
     InsertNewInstBefore(NOr, I);
     NOr->takeName(Op0);
-    return BinaryOperator::createXor(NOr, C1);
+    return BinaryOperator::CreateXor(NOr, C1);
   }
 
   // (A & C)|(B & D)
@@ -4334,8 +4041,8 @@
       
       if (V1) {
         Value *Or =
-          InsertNewInstBefore(BinaryOperator::createOr(V2, V3, "tmp"), I);
-        return BinaryOperator::createAnd(V1, Or);
+          InsertNewInstBefore(BinaryOperator::CreateOr(V2, V3, "tmp"), I);
+        return BinaryOperator::CreateAnd(V1, Or);
       }
     }
   }
@@ -4347,10 +4054,10 @@
           SI0->getOperand(1) == SI1->getOperand(1) &&
           (SI0->hasOneUse() || SI1->hasOneUse())) {
         Instruction *NewOp =
-        InsertNewInstBefore(BinaryOperator::createOr(SI0->getOperand(0),
+        InsertNewInstBefore(BinaryOperator::CreateOr(SI0->getOperand(0),
                                                      SI1->getOperand(0),
                                                      SI0->getName()), I);
-        return BinaryOperator::create(SI1->getOpcode(), NewOp, 
+        return BinaryOperator::Create(SI1->getOpcode(), NewOp, 
                                       SI1->getOperand(1));
       }
   }
@@ -4368,9 +4075,9 @@
 
     // (~A | ~B) == (~(A & B)) - De Morgan's Law
     if (A && isOnlyUse(Op0) && isOnlyUse(Op1)) {
-      Value *And = InsertNewInstBefore(BinaryOperator::createAnd(A, B,
+      Value *And = InsertNewInstBefore(BinaryOperator::CreateAnd(A, B,
                                               I.getName()+".demorgan"), I);
-      return BinaryOperator::createNot(And);
+      return BinaryOperator::CreateNot(And);
     }
   }
 
@@ -4422,7 +4129,7 @@
             case ICmpInst::ICMP_EQ:
               if (LHSCst == SubOne(RHSCst)) {// (X == 13 | X == 14) -> X-13 <u 2
                 Constant *AddCST = ConstantExpr::getNeg(LHSCst);
-                Instruction *Add = BinaryOperator::createAdd(LHSVal, AddCST,
+                Instruction *Add = BinaryOperator::CreateAdd(LHSVal, AddCST,
                                                       LHSVal->getName()+".off");
                 InsertNewInstBefore(Add, I);
                 AddCST = Subtract(AddOne(RHSCst), LHSCst);
@@ -4541,11 +4248,11 @@
                                 I.getType(), TD) &&
               ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
                                 I.getType(), TD)) {
-            Instruction *NewOp = BinaryOperator::createOr(Op0C->getOperand(0),
+            Instruction *NewOp = BinaryOperator::CreateOr(Op0C->getOperand(0),
                                                           Op1C->getOperand(0),
                                                           I.getName());
             InsertNewInstBefore(NewOp, I);
-            return CastInst::create(Op0C->getOpcode(), NewOp, I.getType());
+            return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
           }
         }
       }
@@ -4576,6 +4283,8 @@
   return Changed ? &I : 0;
 }
 
+namespace {
+
 // XorSelf - Implements: X ^ X --> 0
 struct XorSelf {
   Value *RHS;
@@ -4586,6 +4295,7 @@
   }
 };
 
+}
 
 Instruction *InstCombiner::visitXor(BinaryOperator &I) {
   bool Changed = SimplifyCommutative(I);
@@ -4627,13 +4337,13 @@
         if (dyn_castNotVal(Op0I->getOperand(1))) Op0I->swapOperands();
         if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) {
           Instruction *NotY =
-            BinaryOperator::createNot(Op0I->getOperand(1),
+            BinaryOperator::CreateNot(Op0I->getOperand(1),
                                       Op0I->getOperand(1)->getName()+".not");
           InsertNewInstBefore(NotY, I);
           if (Op0I->getOpcode() == Instruction::And)
-            return BinaryOperator::createOr(Op0NotVal, NotY);
+            return BinaryOperator::CreateOr(Op0NotVal, NotY);
           else
-            return BinaryOperator::createAnd(Op0NotVal, NotY);
+            return BinaryOperator::CreateAnd(Op0NotVal, NotY);
         }
       }
     }
@@ -4652,6 +4362,25 @@
                             FCI->getOperand(0), FCI->getOperand(1));
     }
 
+    // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
+    if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+      if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
+        if (CI->hasOneUse() && Op0C->hasOneUse()) {
+          Instruction::CastOps Opcode = Op0C->getOpcode();
+          if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+            if (RHS == ConstantExpr::getCast(Opcode, ConstantInt::getTrue(),
+                                             Op0C->getDestTy())) {
+              Instruction *NewCI = InsertNewInstBefore(CmpInst::Create(
+                                     CI->getOpcode(), CI->getInversePredicate(),
+                                     CI->getOperand(0), CI->getOperand(1)), I);
+              NewCI->takeName(CI);
+              return CastInst::Create(Opcode, NewCI, Op0C->getType());
+            }
+          }
+        }
+      }
+    }
+
     if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
       // ~(c-X) == X-c-1 == X+(-c-1)
       if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue())
@@ -4659,7 +4388,7 @@
           Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
           Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C,
                                               ConstantInt::get(I.getType(), 1));
-          return BinaryOperator::createAdd(Op0I->getOperand(1), ConstantRHS);
+          return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS);
         }
           
       if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
@@ -4667,14 +4396,14 @@
           // ~(X-c) --> (-c-1)-X
           if (RHS->isAllOnesValue()) {
             Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
-            return BinaryOperator::createSub(
+            return BinaryOperator::CreateSub(
                            ConstantExpr::getSub(NegOp0CI,
                                              ConstantInt::get(I.getType(), 1)),
                                           Op0I->getOperand(0));
           } else if (RHS->getValue().isSignBit()) {
             // (X + C) ^ signbit -> (X + C + signbit)
             Constant *C = ConstantInt::get(RHS->getValue() + Op0CI->getValue());
-            return BinaryOperator::createAdd(Op0I->getOperand(0), C);
+            return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
 
           }
         } else if (Op0I->getOpcode() == Instruction::Or) {
@@ -4750,8 +4479,8 @@
         std::swap(A, B);
       if (B == Op1) {                                // (A|B)^B == A & ~B
         Instruction *NotB =
-          InsertNewInstBefore(BinaryOperator::createNot(Op1, "tmp"), I);
-        return BinaryOperator::createAnd(A, NotB);
+          InsertNewInstBefore(BinaryOperator::CreateNot(Op1, "tmp"), I);
+        return BinaryOperator::CreateAnd(A, NotB);
       }
     } else if (match(Op0I, m_Xor(m_Value(A), m_Value(B)))) {
       if (Op1 == A)                                          // (A^B)^A == B
@@ -4764,8 +4493,8 @@
       if (B == Op1 &&                                      // (B&A)^A == ~B & A
           !isa<ConstantInt>(Op1)) {  // Canonical form is (B&C)^C
         Instruction *N =
-          InsertNewInstBefore(BinaryOperator::createNot(A, "tmp"), I);
-        return BinaryOperator::createAnd(N, Op1);
+          InsertNewInstBefore(BinaryOperator::CreateNot(A, "tmp"), I);
+        return BinaryOperator::CreateAnd(N, Op1);
       }
     }
   }
@@ -4776,10 +4505,10 @@
       Op0I->getOperand(1) == Op1I->getOperand(1) &&
       (Op1I->hasOneUse() || Op1I->hasOneUse())) {
     Instruction *NewOp =
-      InsertNewInstBefore(BinaryOperator::createXor(Op0I->getOperand(0),
+      InsertNewInstBefore(BinaryOperator::CreateXor(Op0I->getOperand(0),
                                                     Op1I->getOperand(0),
                                                     Op0I->getName()), I);
-    return BinaryOperator::create(Op1I->getOpcode(), NewOp, 
+    return BinaryOperator::Create(Op1I->getOpcode(), NewOp, 
                                   Op1I->getOperand(1));
   }
     
@@ -4789,13 +4518,13 @@
     if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
         match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
       if ((A == C && B == D) || (A == D && B == C)) 
-        return BinaryOperator::createXor(A, B);
+        return BinaryOperator::CreateXor(A, B);
     }
     // (A | B)^(A & B) -> A ^ B
     if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
         match(Op1I, m_And(m_Value(C), m_Value(D)))) {
       if ((A == C && B == D) || (A == D && B == C)) 
-        return BinaryOperator::createXor(A, B);
+        return BinaryOperator::CreateXor(A, B);
     }
     
     // (A & B)^(C & D)
@@ -4815,8 +4544,8 @@
       
       if (X) {
         Instruction *NewOp =
-        InsertNewInstBefore(BinaryOperator::createXor(Y, Z, Op0->getName()), I);
-        return BinaryOperator::createAnd(NewOp, X);
+        InsertNewInstBefore(BinaryOperator::CreateXor(Y, Z, Op0->getName()), I);
+        return BinaryOperator::CreateAnd(NewOp, X);
       }
     }
   }
@@ -4837,14 +4566,15 @@
                               I.getType(), TD) &&
             ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
                               I.getType(), TD)) {
-          Instruction *NewOp = BinaryOperator::createXor(Op0C->getOperand(0),
+          Instruction *NewOp = BinaryOperator::CreateXor(Op0C->getOperand(0),
                                                          Op1C->getOperand(0),
                                                          I.getName());
           InsertNewInstBefore(NewOp, I);
-          return CastInst::create(Op0C->getOpcode(), NewOp, I.getType());
+          return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
         }
       }
   }
+
   return Changed ? &I : 0;
 }
 
@@ -4873,11 +4603,12 @@
   Value *Result = Constant::getNullValue(IntPtrTy);
 
   // Build a mask for high order bits.
-  unsigned IntPtrWidth = TD.getPointerSize()*8;
+  unsigned IntPtrWidth = TD.getPointerSizeInBits();
   uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
 
-  for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
-    Value *Op = GEP->getOperand(i);
+  for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
+       ++i, ++GTI) {
+    Value *Op = *i;
     uint64_t Size = TD.getABITypeSize(GTI.getIndexedType()) & PtrSizeMask;
     if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
       if (OpC->isZero()) continue;
@@ -4890,7 +4621,7 @@
           Result = ConstantInt::get(RC->getValue() + APInt(IntPtrWidth, Size));
         else
           Result = IC.InsertNewInstBefore(
-                   BinaryOperator::createAdd(Result,
+                   BinaryOperator::CreateAdd(Result,
                                              ConstantInt::get(IntPtrTy, Size),
                                              GEP->getName()+".offs"), I);
         continue;
@@ -4904,7 +4635,7 @@
       else {
         // Emit an add instruction.
         Result = IC.InsertNewInstBefore(
-           BinaryOperator::createAdd(Result, Scale,
+           BinaryOperator::CreateAdd(Result, Scale,
                                      GEP->getName()+".offs"), I);
       }
       continue;
@@ -4922,7 +4653,7 @@
       if (Constant *OpC = dyn_cast<Constant>(Op))
         Op = ConstantExpr::getMul(OpC, Scale);
       else    // We'll let instcombine(mul) convert this to a shl if possible.
-        Op = IC.InsertNewInstBefore(BinaryOperator::createMul(Op, Scale,
+        Op = IC.InsertNewInstBefore(BinaryOperator::CreateMul(Op, Scale,
                                                   GEP->getName()+".idx"), I);
     }
 
@@ -4931,12 +4662,119 @@
       Result = ConstantExpr::getAdd(cast<Constant>(Op),
                                     cast<Constant>(Result));
     else
-      Result = IC.InsertNewInstBefore(BinaryOperator::createAdd(Op, Result,
+      Result = IC.InsertNewInstBefore(BinaryOperator::CreateAdd(Op, Result,
                                                   GEP->getName()+".offs"), I);
   }
   return Result;
 }
 
+
+/// EvaluateGEPOffsetExpression - Return an value that can be used to compare of
+/// the *offset* implied by GEP to zero.  For example, if we have &A[i], we want
+/// to return 'i' for "icmp ne i, 0".  Note that, in general, indices can be
+/// complex, and scales are involved.  The above expression would also be legal
+/// to codegen as "icmp ne (i*4), 0" (assuming A is a pointer to i32).  This
+/// later form is less amenable to optimization though, and we are allowed to
+/// generate the first by knowing that pointer arithmetic doesn't overflow.
+///
+/// If we can't emit an optimized form for this expression, this returns null.
+/// 
+static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
+                                          InstCombiner &IC) {
+  TargetData &TD = IC.getTargetData();
+  gep_type_iterator GTI = gep_type_begin(GEP);
+
+  // Check to see if this gep only has a single variable index.  If so, and if
+  // any constant indices are a multiple of its scale, then we can compute this
+  // in terms of the scale of the variable index.  For example, if the GEP
+  // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
+  // because the expression will cross zero at the same point.
+  unsigned i, e = GEP->getNumOperands();
+  int64_t Offset = 0;
+  for (i = 1; i != e; ++i, ++GTI) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+      // Compute the aggregate offset of constant indices.
+      if (CI->isZero()) continue;
+
+      // Handle a struct index, which adds its field offset to the pointer.
+      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+      } else {
+        uint64_t Size = TD.getABITypeSize(GTI.getIndexedType());
+        Offset += Size*CI->getSExtValue();
+      }
+    } else {
+      // Found our variable index.
+      break;
+    }
+  }
+  
+  // If there are no variable indices, we must have a constant offset, just
+  // evaluate it the general way.
+  if (i == e) return 0;
+  
+  Value *VariableIdx = GEP->getOperand(i);
+  // Determine the scale factor of the variable element.  For example, this is
+  // 4 if the variable index is into an array of i32.
+  uint64_t VariableScale = TD.getABITypeSize(GTI.getIndexedType());
+  
+  // Verify that there are no other variable indices.  If so, emit the hard way.
+  for (++i, ++GTI; i != e; ++i, ++GTI) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (!CI) return 0;
+   
+    // Compute the aggregate offset of constant indices.
+    if (CI->isZero()) continue;
+    
+    // Handle a struct index, which adds its field offset to the pointer.
+    if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+      Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+    } else {
+      uint64_t Size = TD.getABITypeSize(GTI.getIndexedType());
+      Offset += Size*CI->getSExtValue();
+    }
+  }
+  
+  // Okay, we know we have a single variable index, which must be a
+  // pointer/array/vector index.  If there is no offset, life is simple, return
+  // the index.
+  unsigned IntPtrWidth = TD.getPointerSizeInBits();
+  if (Offset == 0) {
+    // Cast to intptrty in case a truncation occurs.  If an extension is needed,
+    // we don't need to bother extending: the extension won't affect where the
+    // computation crosses zero.
+    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth)
+      VariableIdx = new TruncInst(VariableIdx, TD.getIntPtrType(),
+                                  VariableIdx->getNameStart(), &I);
+    return VariableIdx;
+  }
+  
+  // Otherwise, there is an index.  The computation we will do will be modulo
+  // the pointer size, so get it.
+  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+  
+  Offset &= PtrSizeMask;
+  VariableScale &= PtrSizeMask;
+
+  // To do this transformation, any constant index must be a multiple of the
+  // variable scale factor.  For example, we can evaluate "12 + 4*i" as "3 + i",
+  // but we can't evaluate "10 + 3*i" in terms of i.  Check that the offset is a
+  // multiple of the variable scale.
+  int64_t NewOffs = Offset / (int64_t)VariableScale;
+  if (Offset != NewOffs*(int64_t)VariableScale)
+    return 0;
+
+  // Okay, we can do this evaluation.  Start by converting the index to intptr.
+  const Type *IntPtrTy = TD.getIntPtrType();
+  if (VariableIdx->getType() != IntPtrTy)
+    VariableIdx = CastInst::CreateIntegerCast(VariableIdx, IntPtrTy,
+                                              true /*SExt*/, 
+                                              VariableIdx->getNameStart(), &I);
+  Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
+  return BinaryOperator::CreateAdd(VariableIdx, OffsetVal, "offset", &I);
+}
+
+
 /// FoldGEPICmp - Fold comparisons between a GEP instruction and something
 /// else.  At this point we know that the GEP is on the LHS of the comparison.
 Instruction *InstCombiner::FoldGEPICmp(User *GEPLHS, Value *RHS,
@@ -4944,15 +4782,20 @@
                                        Instruction &I) {
   assert(dyn_castGetElementPtr(GEPLHS) && "LHS is not a getelementptr!");
 
-  if (CastInst *CI = dyn_cast<CastInst>(RHS))
-    if (isa<PointerType>(CI->getOperand(0)->getType()))
-      RHS = CI->getOperand(0);
+  // Look through bitcasts.
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(RHS))
+    RHS = BCI->getOperand(0);
 
   Value *PtrBase = GEPLHS->getOperand(0);
   if (PtrBase == RHS) {
     // ((gep Ptr, OFFSET) cmp Ptr)   ---> (OFFSET cmp 0).
-    // This transformation is valid because we know pointers can't overflow.
-    Value *Offset = EmitGEPOffset(GEPLHS, I, *this);
+    // This transformation (ignoring the base and scales) is valid because we
+    // know pointers can't overflow.  See if we can output an optimized form.
+    Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, I, *this);
+    
+    // If not, synthesize the offset the hard way.
+    if (Offset == 0)
+      Offset = EmitGEPOffset(GEPLHS, I, *this);
     return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
                         Constant::getNullValue(Offset->getType()));
   } else if (User *GEPRHS = dyn_castGetElementPtr(RHS)) {
@@ -5022,7 +4865,7 @@
       if (NumDifferences == 0)   // SAME GEP?
         return ReplaceInstUsesWith(I, // No comparison is needed here.
                                    ConstantInt::get(Type::Int1Ty,
-                                                    isTrueWhenEqual(Cond)));
+                                             ICmpInst::isTrueWhenEqual(Cond)));
 
       else if (NumDifferences == 1) {
         Value *LHSV = GEPLHS->getOperand(DiffOperand);
@@ -5045,6 +4888,137 @@
   return 0;
 }
 
+/// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
+///
+Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
+                                                Instruction *LHSI,
+                                                Constant *RHSC) {
+  if (!isa<ConstantFP>(RHSC)) return 0;
+  const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
+  
+  // Get the width of the mantissa.  We don't want to hack on conversions that
+  // might lose information from the integer, e.g. "i64 -> float"
+  int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
+  if (MantissaWidth == -1) return 0;  // Unknown.
+  
+  // Check to see that the input is converted from an integer type that is small
+  // enough that preserves all bits.  TODO: check here for "known" sign bits.
+  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
+  unsigned InputSize = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
+  
+  // If this is a uitofp instruction, we need an extra bit to hold the sign.
+  if (isa<UIToFPInst>(LHSI))
+    ++InputSize;
+  
+  // If the conversion would lose info, don't hack on this.
+  if ((int)InputSize > MantissaWidth)
+    return 0;
+  
+  // Otherwise, we can potentially simplify the comparison.  We know that it
+  // will always come through as an integer value and we know the constant is
+  // not a NAN (it would have been previously simplified).
+  assert(!RHS.isNaN() && "NaN comparison not already folded!");
+  
+  ICmpInst::Predicate Pred;
+  switch (I.getPredicate()) {
+  default: assert(0 && "Unexpected predicate!");
+  case FCmpInst::FCMP_UEQ:
+  case FCmpInst::FCMP_OEQ: Pred = ICmpInst::ICMP_EQ; break;
+  case FCmpInst::FCMP_UGT:
+  case FCmpInst::FCMP_OGT: Pred = ICmpInst::ICMP_SGT; break;
+  case FCmpInst::FCMP_UGE:
+  case FCmpInst::FCMP_OGE: Pred = ICmpInst::ICMP_SGE; break;
+  case FCmpInst::FCMP_ULT:
+  case FCmpInst::FCMP_OLT: Pred = ICmpInst::ICMP_SLT; break;
+  case FCmpInst::FCMP_ULE:
+  case FCmpInst::FCMP_OLE: Pred = ICmpInst::ICMP_SLE; break;
+  case FCmpInst::FCMP_UNE:
+  case FCmpInst::FCMP_ONE: Pred = ICmpInst::ICMP_NE; break;
+  case FCmpInst::FCMP_ORD:
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+  case FCmpInst::FCMP_UNO:
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+  }
+  
+  const IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+  
+  // Now we know that the APFloat is a normal number, zero or inf.
+  
+  // See if the FP constant is too large for the integer.  For example,
+  // comparing an i8 to 300.0.
+  unsigned IntWidth = IntTy->getPrimitiveSizeInBits();
+  
+  // If the RHS value is > SignedMax, fold the comparison.  This handles +INF
+  // and large values. 
+  APFloat SMax(RHS.getSemantics(), APFloat::fcZero, false);
+  SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
+                        APFloat::rmNearestTiesToEven);
+  if (SMax.compare(RHS) == APFloat::cmpLessThan) {  // smax < 13123.0
+    if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
+        Pred == ICmpInst::ICMP_SLE)
+      return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+  }
+  
+  // See if the RHS value is < SignedMin.
+  APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
+  SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
+                        APFloat::rmNearestTiesToEven);
+  if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
+    if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
+        Pred == ICmpInst::ICMP_SGE)
+      return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+  }
+
+  // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] but
+  // it may still be fractional.  See if it is fractional by casting the FP
+  // value to the integer value and back, checking for equality.  Don't do this
+  // for zero, because -0.0 is not fractional.
+  Constant *RHSInt = ConstantExpr::getFPToSI(RHSC, IntTy);
+  if (!RHS.isZero() &&
+      ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) != RHSC) {
+    // If we had a comparison against a fractional value, we have to adjust
+    // the compare predicate and sometimes the value.  RHSC is rounded towards
+    // zero at this point.
+    switch (Pred) {
+    default: assert(0 && "Unexpected integer comparison!");
+    case ICmpInst::ICMP_NE:  // (float)int != 4.4   --> true
+      return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+    case ICmpInst::ICMP_EQ:  // (float)int == 4.4   --> false
+      return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+    case ICmpInst::ICMP_SLE:
+      // (float)int <= 4.4   --> int <= 4
+      // (float)int <= -4.4  --> int < -4
+      if (RHS.isNegative())
+        Pred = ICmpInst::ICMP_SLT;
+      break;
+    case ICmpInst::ICMP_SLT:
+      // (float)int < -4.4   --> int < -4
+      // (float)int < 4.4    --> int <= 4
+      if (!RHS.isNegative())
+        Pred = ICmpInst::ICMP_SLE;
+      break;
+    case ICmpInst::ICMP_SGT:
+      // (float)int > 4.4    --> int > 4
+      // (float)int > -4.4   --> int >= -4
+      if (RHS.isNegative())
+        Pred = ICmpInst::ICMP_SGE;
+      break;
+    case ICmpInst::ICMP_SGE:
+      // (float)int >= -4.4   --> int >= -4
+      // (float)int >= 4.4    --> int > 4
+      if (!RHS.isNegative())
+        Pred = ICmpInst::ICMP_SGT;
+      break;
+    }
+  }
+
+  // Lower this FP comparison into an appropriate integer version of the
+  // comparison.
+  return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
+}
+
 Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
   bool Changed = SimplifyCompare(I);
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -5093,10 +5067,31 @@
 
   // Handle fcmp with constant RHS
   if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+    // If the constant is a nan, see if we can fold the comparison based on it.
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
+      if (CFP->getValueAPF().isNaN()) {
+        if (FCmpInst::isOrdered(I.getPredicate()))   // True if ordered and...
+          return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+        assert(FCmpInst::isUnordered(I.getPredicate()) &&
+               "Comparison must be either ordered or unordered!");
+        // True if unordered.
+        return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+      }
+    }
+    
     if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
       switch (LHSI->getOpcode()) {
       case Instruction::PHI:
-        if (Instruction *NV = FoldOpIntoPhi(I))
+        // Only fold fcmp into the PHI if the phi and fcmp are in the same
+        // block.  If in the same block, we're encouraging jump threading.  If
+        // not, we are just pessimizing the code by making an i1 phi.
+        if (LHSI->getParent() == I.getParent())
+          if (Instruction *NV = FoldOpIntoPhi(I))
+            return NV;
+        break;
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+        if (Instruction *NV = FoldFCmp_IntToFP_Cst(I, LHSI, RHSC))
           return NV;
         break;
       case Instruction::Select:
@@ -5139,7 +5134,7 @@
   // icmp X, X
   if (Op0 == Op1)
     return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 
-                                                   isTrueWhenEqual(I)));
+                                                   I.isTrueWhenEqual()));
 
   if (isa<UndefValue>(Op1))                  // X icmp undef -> undef
     return ReplaceInstUsesWith(I, UndefValue::get(Type::Int1Ty));
@@ -5151,19 +5146,19 @@
       (isa<GlobalValue>(Op1) || isa<AllocaInst>(Op1) ||
        isa<ConstantPointerNull>(Op1)))
     return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 
-                                                   !isTrueWhenEqual(I)));
+                                                   !I.isTrueWhenEqual()));
 
   // icmp's with boolean values can always be turned into bitwise operations
   if (Ty == Type::Int1Ty) {
     switch (I.getPredicate()) {
     default: assert(0 && "Invalid icmp instruction!");
     case ICmpInst::ICMP_EQ: {               // icmp eq bool %A, %B -> ~(A^B)
-      Instruction *Xor = BinaryOperator::createXor(Op0, Op1, I.getName()+"tmp");
+      Instruction *Xor = BinaryOperator::CreateXor(Op0, Op1, I.getName()+"tmp");
       InsertNewInstBefore(Xor, I);
-      return BinaryOperator::createNot(Xor);
+      return BinaryOperator::CreateNot(Xor);
     }
     case ICmpInst::ICMP_NE:                  // icmp eq bool %A, %B -> A^B
-      return BinaryOperator::createXor(Op0, Op1);
+      return BinaryOperator::CreateXor(Op0, Op1);
 
     case ICmpInst::ICMP_UGT:
     case ICmpInst::ICMP_SGT:
@@ -5171,9 +5166,9 @@
       // FALL THROUGH
     case ICmpInst::ICMP_ULT:
     case ICmpInst::ICMP_SLT: {               // icmp lt bool A, B -> ~X & Y
-      Instruction *Not = BinaryOperator::createNot(Op0, I.getName()+"tmp");
+      Instruction *Not = BinaryOperator::CreateNot(Op0, I.getName()+"tmp");
       InsertNewInstBefore(Not, I);
-      return BinaryOperator::createAnd(Not, Op1);
+      return BinaryOperator::CreateAnd(Not, Op1);
     }
     case ICmpInst::ICMP_UGE:
     case ICmpInst::ICMP_SGE:
@@ -5181,9 +5176,9 @@
       // FALL THROUGH
     case ICmpInst::ICMP_ULE:
     case ICmpInst::ICMP_SLE: {               //  icmp le bool %A, %B -> ~A | B
-      Instruction *Not = BinaryOperator::createNot(Op0, I.getName()+"tmp");
+      Instruction *Not = BinaryOperator::CreateNot(Op0, I.getName()+"tmp");
       InsertNewInstBefore(Not, I);
-      return BinaryOperator::createOr(Not, Op1);
+      return BinaryOperator::CreateOr(Not, Op1);
     }
     }
   }
@@ -5396,8 +5391,12 @@
         break;
 
       case Instruction::PHI:
-        if (Instruction *NV = FoldOpIntoPhi(I))
-          return NV;
+        // Only fold icmp into the PHI if the phi and fcmp are in the same
+        // block.  If in the same block, we're encouraging jump threading.  If
+        // not, we are just pessimizing the code by making an i1 phi.
+        if (LHSI->getParent() == I.getParent())
+          if (Instruction *NV = FoldOpIntoPhi(I))
+            return NV;
         break;
       case Instruction::Select: {
         // If either operand of the select is a constant, we can fold the
@@ -5432,7 +5431,7 @@
         if (LHSI->hasOneUse() && isa<ConstantPointerNull>(RHSC)) {
           AddToWorkList(LHSI);
           return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty,
-                                                         !isTrueWhenEqual(I)));
+                                                         !I.isTrueWhenEqual()));
         }
         break;
       }
@@ -5487,8 +5486,21 @@
         return R;
   }
   
+  // ~x < ~y --> y < x
+  { Value *A, *B;
+    if (match(Op0, m_Not(m_Value(A))) &&
+        match(Op1, m_Not(m_Value(B))))
+      return new ICmpInst(I.getPredicate(), B, A);
+  }
+  
   if (I.isEquality()) {
     Value *A, *B, *C, *D;
+    
+    // -x == -y --> x == y
+    if (match(Op0, m_Neg(m_Value(A))) &&
+        match(Op1, m_Neg(m_Value(B))))
+      return new ICmpInst(I.getPredicate(), A, B);
+    
     if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
       if (A == Op1 || B == Op1) {    // (A^B) == A  ->  B == 0
         Value *OtherVal = A == Op1 ? B : A;
@@ -5502,7 +5514,7 @@
           if (ConstantInt *C2 = dyn_cast<ConstantInt>(D))
             if (Op1->hasOneUse()) {
               Constant *NC = ConstantInt::get(C1->getValue() ^ C2->getValue());
-              Instruction *Xor = BinaryOperator::createXor(C, NC, "tmp");
+              Instruction *Xor = BinaryOperator::CreateXor(C, NC, "tmp");
               return new ICmpInst(I.getPredicate(), A,
                                   InsertNewInstBefore(Xor, I));
             }
@@ -5550,8 +5562,8 @@
       }
       
       if (X) {   // Build (X^Y) & Z
-        Op1 = InsertNewInstBefore(BinaryOperator::createXor(X, Y, "tmp"), I);
-        Op1 = InsertNewInstBefore(BinaryOperator::createAnd(Op1, Z, "tmp"), I);
+        Op1 = InsertNewInstBefore(BinaryOperator::CreateXor(X, Y, "tmp"), I);
+        Op1 = InsertNewInstBefore(BinaryOperator::CreateAnd(Op1, Z, "tmp"), I);
         I.setOperand(0, Op1);
         I.setOperand(1, Constant::getNullValue(Op1->getType()));
         return &I;
@@ -5765,7 +5777,7 @@
           APInt NewCI = RHSV;
           NewCI.zext(BitWidth);
           Instruction *NewAnd = 
-            BinaryOperator::createAnd(Cast->getOperand(0),
+            BinaryOperator::CreateAnd(Cast->getOperand(0),
                                       ConstantInt::get(NewCST),LHSI->getName());
           InsertNewInstBefore(NewAnd, ICI);
           return new ICmpInst(ICI.getPredicate(), NewAnd,
@@ -5845,18 +5857,18 @@
         // Compute C << Y.
         Value *NS;
         if (Shift->getOpcode() == Instruction::LShr) {
-          NS = BinaryOperator::createShl(AndCST, 
+          NS = BinaryOperator::CreateShl(AndCST, 
                                          Shift->getOperand(1), "tmp");
         } else {
           // Insert a logical shift.
-          NS = BinaryOperator::createLShr(AndCST,
+          NS = BinaryOperator::CreateLShr(AndCST,
                                           Shift->getOperand(1), "tmp");
         }
         InsertNewInstBefore(cast<Instruction>(NS), ICI);
         
         // Compute X & (C << Y).
         Instruction *NewAnd = 
-          BinaryOperator::createAnd(Shift->getOperand(0), NS, LHSI->getName());
+          BinaryOperator::CreateAnd(Shift->getOperand(0), NS, LHSI->getName());
         InsertNewInstBefore(NewAnd, ICI);
         
         ICI.setOperand(0, NewAnd);
@@ -5895,7 +5907,7 @@
           ConstantInt::get(APInt::getLowBitsSet(TypeBits, TypeBits-ShAmtVal));
         
         Instruction *AndI =
-          BinaryOperator::createAnd(LHSI->getOperand(0),
+          BinaryOperator::CreateAnd(LHSI->getOperand(0),
                                     Mask, LHSI->getName()+".mask");
         Value *And = InsertNewInstBefore(AndI, ICI);
         return new ICmpInst(ICI.getPredicate(), And,
@@ -5911,7 +5923,7 @@
       Constant *Mask = ConstantInt::get(APInt(TypeBits, 1) <<
                                            (TypeBits-ShAmt->getZExtValue()-1));
       Instruction *AndI =
-        BinaryOperator::createAnd(LHSI->getOperand(0),
+        BinaryOperator::CreateAnd(LHSI->getOperand(0),
                                   Mask, LHSI->getName()+".mask");
       Value *And = InsertNewInstBefore(AndI, ICI);
       
@@ -5953,19 +5965,20 @@
     // Otherwise, check to see if the bits shifted out are known to be zero.
     // If so, we can compare against the unshifted value:
     //  (X & 4) >> 1 == 2  --> (X & 4) == 4.
-    if (MaskedValueIsZero(LHSI->getOperand(0), 
+    if (LHSI->hasOneUse() &&
+        MaskedValueIsZero(LHSI->getOperand(0), 
                           APInt::getLowBitsSet(Comp.getBitWidth(), ShAmtVal))) {
       return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
                           ConstantExpr::getShl(RHS, ShAmt));
     }
       
-    if (LHSI->hasOneUse() || RHSV == 0) {
+    if (LHSI->hasOneUse()) {
       // Otherwise strength reduce the shift into an and.
       APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
       Constant *Mask = ConstantInt::get(Val);
       
       Instruction *AndI =
-        BinaryOperator::createAnd(LHSI->getOperand(0),
+        BinaryOperator::CreateAnd(LHSI->getOperand(0),
                                   Mask, LHSI->getName()+".mask");
       Value *And = InsertNewInstBefore(AndI, ICI);
       return new ICmpInst(ICI.getPredicate(), And,
@@ -6034,7 +6047,7 @@
           const APInt &V = cast<ConstantInt>(BO->getOperand(1))->getValue();
           if (V.sgt(APInt(V.getBitWidth(), 1)) && V.isPowerOf2()) {
             Instruction *NewRem =
-              BinaryOperator::createURem(BO->getOperand(0), BO->getOperand(1),
+              BinaryOperator::CreateURem(BO->getOperand(0), BO->getOperand(1),
                                          BO->getName());
             InsertNewInstBefore(NewRem, ICI);
             return new ICmpInst(ICI.getPredicate(), NewRem, 
@@ -6058,7 +6071,7 @@
           else if (Value *NegVal = dyn_castNegVal(BOp0))
             return new ICmpInst(ICI.getPredicate(), NegVal, BOp1);
           else if (BO->hasOneUse()) {
-            Instruction *Neg = BinaryOperator::createNeg(BOp1);
+            Instruction *Neg = BinaryOperator::CreateNeg(BOp1);
             InsertNewInstBefore(Neg, ICI);
             Neg->takeName(BO);
             return new ICmpInst(ICI.getPredicate(), BOp0, Neg);
@@ -6106,7 +6119,7 @@
                                 Constant::getNullValue(RHS->getType()));
           
           // Replace (and X, (1 << size(X)-1) != 0) with x s< 0
-          if (isSignBit(BOC)) {
+          if (BOC->getValue().isSignBit()) {
             Value *X = BO->getOperand(0);
             Constant *Zero = Constant::getNullValue(X->getType());
             ICmpInst::Predicate pred = isICMP_NE ? 
@@ -6295,7 +6308,7 @@
     if (Constant *CI = dyn_cast<Constant>(Result))
       return ReplaceInstUsesWith(ICI, ConstantExpr::getNot(CI));
     else
-      return BinaryOperator::createNot(Result);
+      return BinaryOperator::CreateNot(Result);
   }
 }
 
@@ -6321,7 +6334,7 @@
   // See if we can turn a signed shr into an unsigned shr.
   if (MaskedValueIsZero(Op0, 
                       APInt::getSignBit(I.getType()->getPrimitiveSizeInBits())))
-    return BinaryOperator::createLShr(Op0, I.getOperand(1));
+    return BinaryOperator::CreateLShr(Op0, I.getOperand(1));
   
   return 0;
 }
@@ -6389,7 +6402,7 @@
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
     if (BO->getOpcode() == Instruction::Mul && isLeftShift)
       if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
-        return BinaryOperator::createMul(BO->getOperand(0),
+        return BinaryOperator::CreateMul(BO->getOperand(0),
                                          ConstantExpr::getShl(BOOp, Op1));
   
   // Try to fold constant and into select arguments.
@@ -6412,7 +6425,7 @@
         isa<ConstantInt>(TrOp->getOperand(1))) {
       // Okay, we'll do this xform.  Make the shift of shift.
       Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType());
-      Instruction *NSh = BinaryOperator::create(I.getOpcode(), TrOp, ShAmt,
+      Instruction *NSh = BinaryOperator::Create(I.getOpcode(), TrOp, ShAmt,
                                                 I.getName());
       InsertNewInstBefore(NSh, I); // (shift2 (shift1 & 0x00FF), c2)
 
@@ -6435,7 +6448,7 @@
         MaskV = MaskV.lshr(Op1->getZExtValue());
       }
 
-      Instruction *And = BinaryOperator::createAnd(NSh, ConstantInt::get(MaskV),
+      Instruction *And = BinaryOperator::CreateAnd(NSh, ConstantInt::get(MaskV),
                                                    TI->getName());
       InsertNewInstBefore(And, I); // shift1 & 0x00FF
 
@@ -6460,16 +6473,16 @@
           if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
               match(Op0BO->getOperand(1),
                     m_Shr(m_Value(V1), m_ConstantInt(CC))) && CC == Op1) {
-            Instruction *YS = BinaryOperator::createShl(
+            Instruction *YS = BinaryOperator::CreateShl(
                                             Op0BO->getOperand(0), Op1,
                                             Op0BO->getName());
             InsertNewInstBefore(YS, I); // (Y << C)
             Instruction *X = 
-              BinaryOperator::create(Op0BO->getOpcode(), YS, V1,
+              BinaryOperator::Create(Op0BO->getOpcode(), YS, V1,
                                      Op0BO->getOperand(1)->getName());
             InsertNewInstBefore(X, I);  // (X + (Y << C))
             uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
-            return BinaryOperator::createAnd(X, ConstantInt::get(
+            return BinaryOperator::CreateAnd(X, ConstantInt::get(
                        APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
           }
           
@@ -6480,16 +6493,16 @@
                     m_And(m_Shr(m_Value(V1), m_Value(V2)),m_ConstantInt(CC))) &&
               cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse() &&
               V2 == Op1) {
-            Instruction *YS = BinaryOperator::createShl(
+            Instruction *YS = BinaryOperator::CreateShl(
                                                      Op0BO->getOperand(0), Op1,
                                                      Op0BO->getName());
             InsertNewInstBefore(YS, I); // (Y << C)
             Instruction *XM =
-              BinaryOperator::createAnd(V1, ConstantExpr::getShl(CC, Op1),
+              BinaryOperator::CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
                                         V1->getName()+".mask");
             InsertNewInstBefore(XM, I); // X & (CC << C)
             
-            return BinaryOperator::create(Op0BO->getOpcode(), YS, XM);
+            return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
           }
         }
           
@@ -6499,16 +6512,16 @@
           if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
               match(Op0BO->getOperand(0),
                     m_Shr(m_Value(V1), m_ConstantInt(CC))) && CC == Op1) {
-            Instruction *YS = BinaryOperator::createShl(
+            Instruction *YS = BinaryOperator::CreateShl(
                                                      Op0BO->getOperand(1), Op1,
                                                      Op0BO->getName());
             InsertNewInstBefore(YS, I); // (Y << C)
             Instruction *X =
-              BinaryOperator::create(Op0BO->getOpcode(), V1, YS,
+              BinaryOperator::Create(Op0BO->getOpcode(), V1, YS,
                                      Op0BO->getOperand(0)->getName());
             InsertNewInstBefore(X, I);  // (X + (Y << C))
             uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
-            return BinaryOperator::createAnd(X, ConstantInt::get(
+            return BinaryOperator::CreateAnd(X, ConstantInt::get(
                        APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
           }
           
@@ -6519,16 +6532,16 @@
                           m_ConstantInt(CC))) && V2 == Op1 &&
               cast<BinaryOperator>(Op0BO->getOperand(0))
                   ->getOperand(0)->hasOneUse()) {
-            Instruction *YS = BinaryOperator::createShl(
+            Instruction *YS = BinaryOperator::CreateShl(
                                                      Op0BO->getOperand(1), Op1,
                                                      Op0BO->getName());
             InsertNewInstBefore(YS, I); // (Y << C)
             Instruction *XM =
-              BinaryOperator::createAnd(V1, ConstantExpr::getShl(CC, Op1),
+              BinaryOperator::CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
                                         V1->getName()+".mask");
             InsertNewInstBefore(XM, I); // X & (CC << C)
             
-            return BinaryOperator::create(Op0BO->getOpcode(), XM, YS);
+            return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
           }
           
           break;
@@ -6569,11 +6582,11 @@
           Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
           
           Instruction *NewShift =
-            BinaryOperator::create(I.getOpcode(), Op0BO->getOperand(0), Op1);
+            BinaryOperator::Create(I.getOpcode(), Op0BO->getOperand(0), Op1);
           InsertNewInstBefore(NewShift, I);
           NewShift->takeName(Op0BO);
           
-          return BinaryOperator::create(Op0BO->getOpcode(), NewShift,
+          return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
                                         NewRHS);
         }
       }
@@ -6601,21 +6614,21 @@
     
     // Check for (X << c1) << c2  and  (X >> c1) >> c2
     if (I.getOpcode() == ShiftOp->getOpcode()) {
-      return BinaryOperator::create(I.getOpcode(), X,
+      return BinaryOperator::Create(I.getOpcode(), X,
                                     ConstantInt::get(Ty, AmtSum));
     } else if (ShiftOp->getOpcode() == Instruction::LShr &&
                I.getOpcode() == Instruction::AShr) {
       // ((X >>u C1) >>s C2) -> (X >>u (C1+C2))  since C1 != 0.
-      return BinaryOperator::createLShr(X, ConstantInt::get(Ty, AmtSum));
+      return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
     } else if (ShiftOp->getOpcode() == Instruction::AShr &&
                I.getOpcode() == Instruction::LShr) {
       // ((X >>s C1) >>u C2) -> ((X >>s (C1+C2)) & mask) since C1 != 0.
       Instruction *Shift =
-        BinaryOperator::createAShr(X, ConstantInt::get(Ty, AmtSum));
+        BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
       InsertNewInstBefore(Shift, I);
 
       APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
-      return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+      return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
     }
     
     // Okay, if we get here, one shift must be left, and the other shift must be
@@ -6624,12 +6637,12 @@
       // If we have ((X >>? C) << C), turn this into X & (-1 << C).
       if (I.getOpcode() == Instruction::Shl) {
         APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1));
-        return BinaryOperator::createAnd(X, ConstantInt::get(Mask));
+        return BinaryOperator::CreateAnd(X, ConstantInt::get(Mask));
       }
       // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
       if (I.getOpcode() == Instruction::LShr) {
         APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
-        return BinaryOperator::createAnd(X, ConstantInt::get(Mask));
+        return BinaryOperator::CreateAnd(X, ConstantInt::get(Mask));
       }
       // We can simplify ((X << C) >>s C) into a trunc + sext.
       // NOTE: we could do this for any C, but that would make 'unusual' integer
@@ -6661,22 +6674,22 @@
         assert(ShiftOp->getOpcode() == Instruction::LShr ||
                ShiftOp->getOpcode() == Instruction::AShr);
         Instruction *Shift =
-          BinaryOperator::createShl(X, ConstantInt::get(Ty, ShiftDiff));
+          BinaryOperator::CreateShl(X, ConstantInt::get(Ty, ShiftDiff));
         InsertNewInstBefore(Shift, I);
         
         APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
-        return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+        return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
       }
       
       // (X << C1) >>u C2  --> X >>u (C2-C1) & (-1 >> C2)
       if (I.getOpcode() == Instruction::LShr) {
         assert(ShiftOp->getOpcode() == Instruction::Shl);
         Instruction *Shift =
-          BinaryOperator::createLShr(X, ConstantInt::get(Ty, ShiftDiff));
+          BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, ShiftDiff));
         InsertNewInstBefore(Shift, I);
         
         APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
-        return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+        return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
       }
       
       // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in.
@@ -6689,23 +6702,23 @@
         assert(ShiftOp->getOpcode() == Instruction::LShr ||
                ShiftOp->getOpcode() == Instruction::AShr);
         Instruction *Shift =
-          BinaryOperator::create(ShiftOp->getOpcode(), X,
+          BinaryOperator::Create(ShiftOp->getOpcode(), X,
                                  ConstantInt::get(Ty, ShiftDiff));
         InsertNewInstBefore(Shift, I);
         
         APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
-        return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+        return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
       }
       
       // (X << C1) >>u C2  --> X << (C1-C2) & (-1 >> C2)
       if (I.getOpcode() == Instruction::LShr) {
         assert(ShiftOp->getOpcode() == Instruction::Shl);
         Instruction *Shift =
-          BinaryOperator::createShl(X, ConstantInt::get(Ty, ShiftDiff));
+          BinaryOperator::CreateShl(X, ConstantInt::get(Ty, ShiftDiff));
         InsertNewInstBefore(Shift, I);
         
         APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
-        return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+        return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
       }
       
       // We can't handle (X << C1) >>a C2, it shifts arbitrary bits in.
@@ -6820,14 +6833,14 @@
       Amt = Multiply(cast<ConstantInt>(NumElements), cast<ConstantInt>(Amt));
     // otherwise multiply the amount and the number of elements
     else if (Scale != 1) {
-      Instruction *Tmp = BinaryOperator::createMul(Amt, NumElements, "tmp");
+      Instruction *Tmp = BinaryOperator::CreateMul(Amt, NumElements, "tmp");
       Amt = InsertNewInstBefore(Tmp, AI);
     }
   }
   
   if (int Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
     Value *Off = ConstantInt::get(Type::Int32Ty, Offset, true);
-    Instruction *Tmp = BinaryOperator::createAdd(Amt, Off, "tmp");
+    Instruction *Tmp = BinaryOperator::CreateAdd(Amt, Off, "tmp");
     Amt = InsertNewInstBefore(Tmp, AI);
   }
   
@@ -6861,6 +6874,16 @@
 ///
 /// This is a truncation operation if Ty is smaller than V->getType(), or an
 /// extension operation if Ty is larger.
+///
+/// If CastOpc is a truncation, then Ty will be a type smaller than V.  We
+/// should return true if trunc(V) can be computed by computing V in the smaller
+/// type.  If V is an instruction, then trunc(inst(x,y)) can be computed as
+/// inst(trunc(x),trunc(y)), which only makes sense if x and y can be
+/// efficiently truncated.
+///
+/// If CastOpc is a sext or zext, we are asking if the low bits of the value can
+/// bit computed in a larger type, which is then and'd or sext_in_reg'd to get
+/// the final result.
 bool InstCombiner::CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
                                               unsigned CastOpc,
                                               int &NumCastsRemoved) {
@@ -6881,7 +6904,7 @@
       // If the first operand is itself a cast, and is eliminable, do not count
       // this as an eliminable cast.  We would prefer to eliminate those two
       // casts first.
-      if (!isa<CastInst>(I->getOperand(0)))
+      if (!isa<CastInst>(I->getOperand(0)) && I->hasOneUse())
         ++NumCastsRemoved;
       return true;
     }
@@ -6894,6 +6917,7 @@
   switch (I->getOpcode()) {
   case Instruction::Add:
   case Instruction::Sub:
+  case Instruction::Mul:
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
@@ -6903,14 +6927,6 @@
            CanEvaluateInDifferentType(I->getOperand(1), Ty, CastOpc,
                                       NumCastsRemoved);
 
-  case Instruction::Mul:
-    // A multiply can be truncated by truncating its operands.
-    return Ty->getBitWidth() < OrigTy->getBitWidth() && 
-           CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
-                                      NumCastsRemoved) &&
-           CanEvaluateInDifferentType(I->getOperand(1), Ty, CastOpc,
-                                      NumCastsRemoved);
-
   case Instruction::Shl:
     // If we are truncating the result of this SHL, and if it's a shift of a
     // constant amount, we can always perform a SHL in a smaller type.
@@ -6946,8 +6962,23 @@
     // of casts in the input.
     if (I->getOpcode() == CastOpc)
       return true;
-    
     break;
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    return CanEvaluateInDifferentType(SI->getTrueValue(), Ty, CastOpc,
+                                      NumCastsRemoved) &&
+           CanEvaluateInDifferentType(SI->getFalseValue(), Ty, CastOpc,
+                                      NumCastsRemoved);
+  }
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateInDifferentType(PN->getIncomingValue(i), Ty, CastOpc,
+                                      NumCastsRemoved))
+        return false;
+    return true;
+  }
   default:
     // TODO: Can handle more cases here.
     break;
@@ -6979,8 +7010,8 @@
   case Instruction::Shl: {
     Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
     Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
-    Res = BinaryOperator::create((Instruction::BinaryOps)I->getOpcode(),
-                                 LHS, RHS, I->getName());
+    Res = BinaryOperator::Create((Instruction::BinaryOps)I->getOpcode(),
+                                 LHS, RHS);
     break;
   }    
   case Instruction::Trunc:
@@ -6992,16 +7023,33 @@
     if (I->getOperand(0)->getType() == Ty)
       return I->getOperand(0);
     
-    // Otherwise, must be the same type of case, so just reinsert a new one.
-    Res = CastInst::create(cast<CastInst>(I)->getOpcode(), I->getOperand(0),
-                           Ty, I->getName());
+    // Otherwise, must be the same type of cast, so just reinsert a new one.
+    Res = CastInst::Create(cast<CastInst>(I)->getOpcode(), I->getOperand(0),
+                           Ty);
+    break;
+  case Instruction::Select: {
+    Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+    Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
+    Res = SelectInst::Create(I->getOperand(0), True, False);
+    break;
+  }
+  case Instruction::PHI: {
+    PHINode *OPN = cast<PHINode>(I);
+    PHINode *NPN = PHINode::Create(Ty);
+    for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
+      Value *V =EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
+      NPN->addIncoming(V, OPN->getIncomingBlock(i));
+    }
+    Res = NPN;
     break;
+  }
   default: 
     // TODO: Can handle more cases here.
     assert(0 && "Unreachable!");
     break;
   }
   
+  Res->takeName(I);
   return InsertNewInstBefore(Res, *I);
 }
 
@@ -7016,7 +7064,7 @@
         isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) {
       // The first cast (CSrc) is eliminable so we need to fix up or replace
       // the second cast (CI). CSrc will then have a good chance of being dead.
-      return CastInst::create(opc, CSrc->getOperand(0), CI.getType());
+      return CastInst::Create(opc, CSrc->getOperand(0), CI.getType());
     }
   }
 
@@ -7211,11 +7259,11 @@
         assert(SrcBitSize < DestBitSize && "Not a zext?");
         Constant *C = ConstantInt::get(APInt::getLowBitsSet(DestBitSize,
                                                             SrcBitSize));
-        return BinaryOperator::createAnd(Res, C);
+        return BinaryOperator::CreateAnd(Res, C);
       }
       case Instruction::SExt:
         // We need to emit a cast to truncate, then a cast to sext.
-        return CastInst::create(Instruction::SExt,
+        return CastInst::Create(Instruction::SExt,
             InsertCastBefore(Instruction::Trunc, Res, Src->getType(), 
                              CI), DestTy);
       }
@@ -7242,7 +7290,7 @@
         Instruction::CastOps opcode = CI.getOpcode();
         Value *Op0c = InsertOperandCastBefore(opcode, Op0, DestTy, SrcI);
         Value *Op1c = InsertOperandCastBefore(opcode, Op1, DestTy, SrcI);
-        return BinaryOperator::create(
+        return BinaryOperator::Create(
             cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
       }
     }
@@ -7253,7 +7301,7 @@
         Op1 == ConstantInt::getTrue() &&
         (!Op0->hasOneUse() || !isa<CmpInst>(Op0))) {
       Value *New = InsertOperandCastBefore(Instruction::ZExt, Op0, DestTy, &CI);
-      return BinaryOperator::createXor(New, ConstantInt::get(CI.getType(), 1));
+      return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1));
     }
     break;
   case Instruction::SDiv:
@@ -7271,7 +7319,7 @@
                                               Op0, DestTy, SrcI);
         Value *Op1c = InsertOperandCastBefore(Instruction::BitCast, 
                                               Op1, DestTy, SrcI);
-        return BinaryOperator::create(
+        return BinaryOperator::Create(
           cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
       }
     }
@@ -7289,7 +7337,7 @@
           Instruction::BitCast : Instruction::Trunc);
       Value *Op0c = InsertOperandCastBefore(opcode, Op0, DestTy, SrcI);
       Value *Op1c = InsertOperandCastBefore(opcode, Op1, DestTy, SrcI);
-      return BinaryOperator::createShl(Op0c, Op1c);
+      return BinaryOperator::CreateShl(Op0c, Op1c);
     }
     break;
   case Instruction::AShr:
@@ -7301,7 +7349,7 @@
       uint32_t ShiftAmt = cast<ConstantInt>(Op1)->getLimitedValue(SrcBitSize);
       if (SrcBitSize > ShiftAmt && SrcBitSize-ShiftAmt >= DestBitSize) {
         // Insert the new logical shift right.
-        return BinaryOperator::createLShr(Op0, Op1);
+        return BinaryOperator::CreateLShr(Op0, Op1);
       }
     }
     break;
@@ -7339,7 +7387,7 @@
           Value *V1 = InsertCastBefore(Instruction::Trunc, SrcIOp0, Ty, CI);
           Value *V2 = InsertCastBefore(Instruction::Trunc, SrcI->getOperand(1),
                                        Ty, CI);
-          return BinaryOperator::createLShr(V1, V2);
+          return BinaryOperator::CreateLShr(V1, V2);
         }
       } else {     // This is a variable shr.
         
@@ -7350,9 +7398,9 @@
           Value *One = ConstantInt::get(SrcI->getType(), 1);
 
           Value *V = InsertNewInstBefore(
-              BinaryOperator::createShl(One, SrcI->getOperand(1),
+              BinaryOperator::CreateShl(One, SrcI->getOperand(1),
                                      "tmp"), CI);
-          V = InsertNewInstBefore(BinaryOperator::createAnd(V,
+          V = InsertNewInstBefore(BinaryOperator::CreateAnd(V,
                                                             SrcI->getOperand(0),
                                                             "tmp"), CI);
           Value *Zero = Constant::getNullValue(V->getType());
@@ -7385,16 +7433,16 @@
       Value *In = ICI->getOperand(0);
       Value *Sh = ConstantInt::get(In->getType(),
                                    In->getType()->getPrimitiveSizeInBits()-1);
-      In = InsertNewInstBefore(BinaryOperator::createLShr(In, Sh,
+      In = InsertNewInstBefore(BinaryOperator::CreateLShr(In, Sh,
                                                         In->getName()+".lobit"),
                                CI);
       if (In->getType() != CI.getType())
-        In = CastInst::createIntegerCast(In, CI.getType(),
+        In = CastInst::CreateIntegerCast(In, CI.getType(),
                                          false/*ZExt*/, "tmp", &CI);
 
       if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
         Constant *One = ConstantInt::get(In->getType(), 1);
-        In = InsertNewInstBefore(BinaryOperator::createXor(In, One,
+        In = InsertNewInstBefore(BinaryOperator::CreateXor(In, One,
                                                          In->getName()+".not"),
                                  CI);
       }
@@ -7439,21 +7487,21 @@
         if (ShiftAmt) {
           // Perform a logical shr by shiftamt.
           // Insert the shift to put the result in the low bit.
-          In = InsertNewInstBefore(BinaryOperator::createLShr(In,
+          In = InsertNewInstBefore(BinaryOperator::CreateLShr(In,
                                   ConstantInt::get(In->getType(), ShiftAmt),
                                                    In->getName()+".lobit"), CI);
         }
           
         if ((Op1CV != 0) == isNE) { // Toggle the low bit.
           Constant *One = ConstantInt::get(In->getType(), 1);
-          In = BinaryOperator::createXor(In, One, "tmp");
+          In = BinaryOperator::CreateXor(In, One, "tmp");
           InsertNewInstBefore(cast<Instruction>(In), CI);
         }
           
         if (CI.getType() == In->getType())
           return ReplaceInstUsesWith(CI, In);
         else
-          return CastInst::createIntegerCast(In, CI.getType(), false/*ZExt*/);
+          return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
       }
     }
   }
@@ -7485,12 +7533,12 @@
         APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
         Constant *AndConst = ConstantInt::get(AndValue);
         Instruction *And = 
-          BinaryOperator::createAnd(CSrc->getOperand(0), AndConst);
+          BinaryOperator::CreateAnd(CSrc->getOperand(0), AndConst);
         // Unfortunately, if the type changed, we need to cast it back.
         if (And->getType() != CI.getType()) {
           And->setName(CSrc->getName()+".mask");
           InsertNewInstBefore(And, CI);
-          And = CastInst::createIntegerCast(And, CI.getType(), false/*ZExt*/);
+          And = CastInst::CreateIntegerCast(And, CI.getType(), false/*ZExt*/);
         }
         return And;
       }
@@ -7511,7 +7559,7 @@
          transformZExtICmp(RHS, CI, false))) {
       Value *LCast = InsertCastBefore(Instruction::ZExt, LHS, CI.getType(), CI);
       Value *RCast = InsertCastBefore(Instruction::ZExt, RHS, CI.getType(), CI);
-      return BinaryOperator::create(Instruction::Or, LCast, RCast);
+      return BinaryOperator::Create(Instruction::Or, LCast, RCast);
     }
   }
 
@@ -7540,21 +7588,48 @@
         Value *In = ICI->getOperand(0);
         Value *Sh = ConstantInt::get(In->getType(),
                                      In->getType()->getPrimitiveSizeInBits()-1);
-        In = InsertNewInstBefore(BinaryOperator::createAShr(In, Sh,
+        In = InsertNewInstBefore(BinaryOperator::CreateAShr(In, Sh,
                                                         In->getName()+".lobit"),
                                  CI);
         if (In->getType() != CI.getType())
-          In = CastInst::createIntegerCast(In, CI.getType(),
+          In = CastInst::CreateIntegerCast(In, CI.getType(),
                                            true/*SExt*/, "tmp", &CI);
         
         if (ICI->getPredicate() == ICmpInst::ICMP_SGT)
-          In = InsertNewInstBefore(BinaryOperator::createNot(In,
+          In = InsertNewInstBefore(BinaryOperator::CreateNot(In,
                                      In->getName()+".not"), CI);
         
         return ReplaceInstUsesWith(CI, In);
       }
     }
   }
+
+  // See if the value being truncated is already sign extended.  If so, just
+  // eliminate the trunc/sext pair.
+  if (getOpcode(Src) == Instruction::Trunc) {
+    Value *Op = cast<User>(Src)->getOperand(0);
+    unsigned OpBits   = cast<IntegerType>(Op->getType())->getBitWidth();
+    unsigned MidBits  = cast<IntegerType>(Src->getType())->getBitWidth();
+    unsigned DestBits = cast<IntegerType>(CI.getType())->getBitWidth();
+    unsigned NumSignBits = ComputeNumSignBits(Op);
+
+    if (OpBits == DestBits) {
+      // Op is i32, Mid is i8, and Dest is i32.  If Op has more than 24 sign
+      // bits, it is already ready.
+      if (NumSignBits > DestBits-MidBits)
+        return ReplaceInstUsesWith(CI, Op);
+    } else if (OpBits < DestBits) {
+      // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
+      // bits, just sext from i32.
+      if (NumSignBits > OpBits-MidBits)
+        return new SExtInst(Op, CI.getType(), "tmp");
+    } else {
+      // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
+      // bits, just truncate to i32.
+      if (NumSignBits > OpBits-MidBits)
+        return new TruncInst(Op, CI.getType(), "tmp");
+    }
+  }
       
   return 0;
 }
@@ -7625,7 +7700,7 @@
                                       CI.getType(), CI);
           RHSTrunc = InsertCastBefore(Instruction::FPExt, RHSTrunc,
                                       CI.getType(), CI);
-          return BinaryOperator::create(OpI->getOpcode(), LHSTrunc, RHSTrunc);
+          return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc);
         }
       }
       break;  
@@ -7638,12 +7713,30 @@
   return commonCastTransforms(CI);
 }
 
-Instruction *InstCombiner::visitFPToUI(CastInst &CI) {
-  return commonCastTransforms(CI);
-}
-
-Instruction *InstCombiner::visitFPToSI(CastInst &CI) {
-  return commonCastTransforms(CI);
+Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
+  // fptoui(uitofp(X)) --> X  if the intermediate type has enough bits in its
+  // mantissa to accurately represent all values of X.  For example, do not
+  // do this with i64->float->i64.
+  if (UIToFPInst *SrcI = dyn_cast<UIToFPInst>(FI.getOperand(0)))
+    if (SrcI->getOperand(0)->getType() == FI.getType() &&
+        (int)FI.getType()->getPrimitiveSizeInBits() < /*extra bit for sign */
+                    SrcI->getType()->getFPMantissaWidth())
+      return ReplaceInstUsesWith(FI, SrcI->getOperand(0));
+
+  return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
+  // fptosi(sitofp(X)) --> X  if the intermediate type has enough bits in its
+  // mantissa to accurately represent all values of X.  For example, do not
+  // do this with i64->float->i64.
+  if (SIToFPInst *SrcI = dyn_cast<SIToFPInst>(FI.getOperand(0)))
+    if (SrcI->getOperand(0)->getType() == FI.getType() &&
+        (int)FI.getType()->getPrimitiveSizeInBits() <= 
+                    SrcI->getType()->getFPMantissaWidth())
+      return ReplaceInstUsesWith(FI, SrcI->getOperand(0));
+  
+  return commonCastTransforms(FI);
 }
 
 Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
@@ -7868,7 +7961,7 @@
     SelectInst *NewSI = SelectInst::Create(SI.getCondition(), TI->getOperand(0),
                                            FI->getOperand(0), SI.getName()+".v");
     InsertNewInstBefore(NewSI, SI);
-    return CastInst::create(Instruction::CastOps(TI->getOpcode()), NewSI, 
+    return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI, 
                             TI->getType());
   }
 
@@ -7912,9 +8005,9 @@
 
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
     if (MatchIsOpZero)
-      return BinaryOperator::create(BO->getOpcode(), MatchOp, NewSI);
+      return BinaryOperator::Create(BO->getOpcode(), MatchOp, NewSI);
     else
-      return BinaryOperator::create(BO->getOpcode(), NewSI, MatchOp);
+      return BinaryOperator::Create(BO->getOpcode(), NewSI, MatchOp);
   }
   assert(0 && "Shouldn't get here");
   return 0;
@@ -7949,33 +8042,33 @@
     if (ConstantInt *C = dyn_cast<ConstantInt>(TrueVal)) {
       if (C->getZExtValue()) {
         // Change: A = select B, true, C --> A = or B, C
-        return BinaryOperator::createOr(CondVal, FalseVal);
+        return BinaryOperator::CreateOr(CondVal, FalseVal);
       } else {
         // Change: A = select B, false, C --> A = and !B, C
         Value *NotCond =
-          InsertNewInstBefore(BinaryOperator::createNot(CondVal,
+          InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
                                              "not."+CondVal->getName()), SI);
-        return BinaryOperator::createAnd(NotCond, FalseVal);
+        return BinaryOperator::CreateAnd(NotCond, FalseVal);
       }
     } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
       if (C->getZExtValue() == false) {
         // Change: A = select B, C, false --> A = and B, C
-        return BinaryOperator::createAnd(CondVal, TrueVal);
+        return BinaryOperator::CreateAnd(CondVal, TrueVal);
       } else {
         // Change: A = select B, C, true --> A = or !B, C
         Value *NotCond =
-          InsertNewInstBefore(BinaryOperator::createNot(CondVal,
+          InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
                                              "not."+CondVal->getName()), SI);
-        return BinaryOperator::createOr(NotCond, TrueVal);
+        return BinaryOperator::CreateOr(NotCond, TrueVal);
       }
     }
     
     // select a, b, a  -> a&b
     // select a, a, b  -> a|b
     if (CondVal == TrueVal)
-      return BinaryOperator::createOr(CondVal, FalseVal);
+      return BinaryOperator::CreateOr(CondVal, FalseVal);
     else if (CondVal == FalseVal)
-      return BinaryOperator::createAnd(CondVal, TrueVal);
+      return BinaryOperator::CreateAnd(CondVal, TrueVal);
   }
 
   // Selecting between two integer constants?
@@ -7983,13 +8076,13 @@
     if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal)) {
       // select C, 1, 0 -> zext C to int
       if (FalseValC->isZero() && TrueValC->getValue() == 1) {
-        return CastInst::create(Instruction::ZExt, CondVal, SI.getType());
+        return CastInst::Create(Instruction::ZExt, CondVal, SI.getType());
       } else if (TrueValC->isZero() && FalseValC->getValue() == 1) {
         // select C, 0, 1 -> zext !C to int
         Value *NotCond =
-          InsertNewInstBefore(BinaryOperator::createNot(CondVal,
+          InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
                                                "not."+CondVal->getName()), SI);
-        return CastInst::create(Instruction::ZExt, NotCond, SI.getType());
+        return CastInst::Create(Instruction::ZExt, NotCond, SI.getType());
       }
       
       // FIXME: Turn select 0/-1 and -1/0 into sext from condition!
@@ -8005,7 +8098,7 @@
               Value *X = IC->getOperand(0);
               uint32_t Bits = X->getType()->getPrimitiveSizeInBits();
               Constant *ShAmt = ConstantInt::get(X->getType(), Bits-1);
-              Instruction *SRA = BinaryOperator::create(Instruction::AShr, X,
+              Instruction *SRA = BinaryOperator::Create(Instruction::AShr, X,
                                                         ShAmt, "ones");
               InsertNewInstBefore(SRA, SI);
               
@@ -8018,7 +8111,7 @@
                 opc = Instruction::SExt;
               else if (SRASize > SISize)
                 opc = Instruction::Trunc;
-              return CastInst::create(opc, SRA, SI.getType());
+              return CastInst::Create(opc, SRA, SI.getType());
             }
           }
 
@@ -8043,7 +8136,7 @@
                 ShouldNotVal ^= IC->getPredicate() == ICmpInst::ICMP_NE;
                 Value *V = ICA;
                 if (ShouldNotVal)
-                  V = InsertNewInstBefore(BinaryOperator::create(
+                  V = InsertNewInstBefore(BinaryOperator::Create(
                                   Instruction::Xor, V, ICA->getOperand(1)), SI);
                 return ReplaceInstUsesWith(SI, V);
               }
@@ -8148,7 +8241,7 @@
               NegVal = ConstantExpr::getNeg(C);
             } else {
               NegVal = InsertNewInstBefore(
-                    BinaryOperator::createNeg(SubOp->getOperand(1), "tmp"), SI);
+                    BinaryOperator::CreateNeg(SubOp->getOperand(1), "tmp"), SI);
             }
 
             Value *NewTrueOp = OtherAddOp;
@@ -8156,10 +8249,11 @@
             if (AddOp != TI)
               std::swap(NewTrueOp, NewFalseOp);
             Instruction *NewSel =
-              SelectInst::Create(CondVal, NewTrueOp,NewFalseOp,SI.getName()+".p");
+              SelectInst::Create(CondVal, NewTrueOp,
+                                 NewFalseOp, SI.getName() + ".p");
 
             NewSel = InsertNewInstBefore(NewSel, SI);
-            return BinaryOperator::createAdd(SubOp->getOperand(0), NewSel);
+            return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
           }
         }
       }
@@ -8182,11 +8276,12 @@
           if (OpToFold) {
             Constant *C = GetSelectFoldableConstant(TVI);
             Instruction *NewSel =
-              SelectInst::Create(SI.getCondition(), TVI->getOperand(2-OpToFold), C);
+              SelectInst::Create(SI.getCondition(),
+                                 TVI->getOperand(2-OpToFold), C);
             InsertNewInstBefore(NewSel, SI);
             NewSel->takeName(TVI);
             if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TVI))
-              return BinaryOperator::create(BO->getOpcode(), FalseVal, NewSel);
+              return BinaryOperator::Create(BO->getOpcode(), FalseVal, NewSel);
             else {
               assert(0 && "Unknown instruction!!");
             }
@@ -8207,11 +8302,12 @@
           if (OpToFold) {
             Constant *C = GetSelectFoldableConstant(FVI);
             Instruction *NewSel =
-              SelectInst::Create(SI.getCondition(), C, FVI->getOperand(2-OpToFold));
+              SelectInst::Create(SI.getCondition(), C,
+                                 FVI->getOperand(2-OpToFold));
             InsertNewInstBefore(NewSel, SI);
             NewSel->takeName(FVI);
             if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FVI))
-              return BinaryOperator::create(BO->getOpcode(), TrueVal, NewSel);
+              return BinaryOperator::Create(BO->getOpcode(), TrueVal, NewSel);
             else
               assert(0 && "Unknown instruction!!");
           }
@@ -8247,9 +8343,9 @@
   case Instruction::GetElementPtr: {
     // If all indexes are zero, it is just the alignment of the base pointer.
     bool AllZeroOperands = true;
-    for (unsigned i = 1, e = U->getNumOperands(); i != e; ++i)
-      if (!isa<Constant>(U->getOperand(i)) ||
-          !cast<Constant>(U->getOperand(i))->isNullValue()) {
+    for (User::op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i)
+      if (!isa<Constant>(*i) ||
+          !cast<Constant>(*i)->isNullValue()) {
         AllZeroOperands = false;
         break;
       }
@@ -8323,7 +8419,9 @@
   // A single load+store correctly handles overlapping memory in the memmove
   // case.
   unsigned Size = MemOpLength->getZExtValue();
-  if (Size == 0 || Size > 8 || (Size&(Size-1)))
+  if (Size == 0) return MI;  // Delete this mem transfer.
+  
+  if (Size > 8 || (Size&(Size-1)))
     return 0;  // If not 1/2/4/8 bytes, exit.
   
   // Use an integer load+store unless we can find something better.
@@ -8340,7 +8438,7 @@
     if (SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
       // The SrcETy might be something like {{{double}}} or [1 x double].  Rip
       // down through these levels if so.
-      while (!SrcETy->isFirstClassType()) {
+      while (!SrcETy->isSingleValueType()) {
         if (const StructType *STy = dyn_cast<StructType>(SrcETy)) {
           if (STy->getNumElements() == 1)
             SrcETy = STy->getElementType(0);
@@ -8355,7 +8453,7 @@
           break;
       }
       
-      if (SrcETy->isFirstClassType())
+      if (SrcETy->isSingleValueType())
         NewPtrTy = PointerType::getUnqual(SrcETy);
     }
   }
@@ -8377,6 +8475,48 @@
   return MI;
 }
 
+Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
+  unsigned Alignment = GetOrEnforceKnownAlignment(MI->getDest());
+  if (MI->getAlignment()->getZExtValue() < Alignment) {
+    MI->setAlignment(ConstantInt::get(Type::Int32Ty, Alignment));
+    return MI;
+  }
+  
+  // Extract the length and alignment and fill if they are constant.
+  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
+  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
+  if (!LenC || !FillC || FillC->getType() != Type::Int8Ty)
+    return 0;
+  uint64_t Len = LenC->getZExtValue();
+  Alignment = MI->getAlignment()->getZExtValue();
+  
+  // If the length is zero, this is a no-op
+  if (Len == 0) return MI; // memset(d,c,0,a) -> noop
+  
+  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
+  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
+    const Type *ITy = IntegerType::get(Len*8);  // n=1 -> i8.
+    
+    Value *Dest = MI->getDest();
+    Dest = InsertBitCastBefore(Dest, PointerType::getUnqual(ITy), *MI);
+
+    // Alignment 0 is identity for alignment 1 for memset, but not store.
+    if (Alignment == 0) Alignment = 1;
+    
+    // Extract the fill value and store.
+    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
+    InsertNewInstBefore(new StoreInst(ConstantInt::get(ITy, Fill), Dest, false,
+                                      Alignment), *MI);
+    
+    // Set the size of the copy to 0, it will be deleted on the next iteration.
+    MI->setLength(Constant::getNullValue(LenC->getType()));
+    return MI;
+  }
+
+  return 0;
+}
+
+
 /// visitCallInst - CallInst simplification.  This mostly only handles folding 
 /// of intrinsic instructions.  For normal calls, it allows visitCallSite to do
 /// the heavy lifting.
@@ -8417,6 +8557,10 @@
           CI.setOperand(0, Intrinsic::getDeclaration(M, MemCpyID));
           Changed = true;
         }
+
+      // memmove(x,x,size) -> noop.
+      if (MMI->getSource() == MMI->getDest())
+        return EraseInstFromFunction(CI);
     }
 
     // If we can determine a pointer alignment that is bigger than currently
@@ -8424,152 +8568,161 @@
     if (isa<MemCpyInst>(MI) || isa<MemMoveInst>(MI)) {
       if (Instruction *I = SimplifyMemTransfer(MI))
         return I;
-    } else if (isa<MemSetInst>(MI)) {
-      unsigned Alignment = GetOrEnforceKnownAlignment(MI->getDest());
-      if (MI->getAlignment()->getZExtValue() < Alignment) {
-        MI->setAlignment(ConstantInt::get(Type::Int32Ty, Alignment));
-        Changed = true;
-      }
+    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
+      if (Instruction *I = SimplifyMemSet(MSI))
+        return I;
     }
           
     if (Changed) return II;
-  } else {
-    switch (II->getIntrinsicID()) {
-    default: break;
-    case Intrinsic::ppc_altivec_lvx:
-    case Intrinsic::ppc_altivec_lvxl:
-    case Intrinsic::x86_sse_loadu_ps:
-    case Intrinsic::x86_sse2_loadu_pd:
-    case Intrinsic::x86_sse2_loadu_dq:
-      // Turn PPC lvx     -> load if the pointer is known aligned.
-      // Turn X86 loadups -> load if the pointer is known aligned.
-      if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
-        Value *Ptr = InsertBitCastBefore(II->getOperand(1),
-                                         PointerType::getUnqual(II->getType()),
-                                         CI);
-        return new LoadInst(Ptr);
-      }
-      break;
-    case Intrinsic::ppc_altivec_stvx:
-    case Intrinsic::ppc_altivec_stvxl:
-      // Turn stvx -> store if the pointer is known aligned.
-      if (GetOrEnforceKnownAlignment(II->getOperand(2), 16) >= 16) {
-        const Type *OpPtrTy = 
-          PointerType::getUnqual(II->getOperand(1)->getType());
-        Value *Ptr = InsertBitCastBefore(II->getOperand(2), OpPtrTy, CI);
-        return new StoreInst(II->getOperand(1), Ptr);
-      }
-      break;
-    case Intrinsic::x86_sse_storeu_ps:
-    case Intrinsic::x86_sse2_storeu_pd:
-    case Intrinsic::x86_sse2_storeu_dq:
-    case Intrinsic::x86_sse2_storel_dq:
-      // Turn X86 storeu -> store if the pointer is known aligned.
-      if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
-        const Type *OpPtrTy = 
-          PointerType::getUnqual(II->getOperand(2)->getType());
-        Value *Ptr = InsertBitCastBefore(II->getOperand(1), OpPtrTy, CI);
-        return new StoreInst(II->getOperand(2), Ptr);
-      }
-      break;
-      
-    case Intrinsic::x86_sse_cvttss2si: {
-      // These intrinsics only demands the 0th element of its input vector.  If
-      // we can simplify the input based on that, do so now.
-      uint64_t UndefElts;
-      if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1, 
-                                                UndefElts)) {
-        II->setOperand(1, V);
-        return II;
+  }
+  
+  switch (II->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::bswap:
+    // bswap(bswap(x)) -> x
+    if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getOperand(1)))
+      if (Operand->getIntrinsicID() == Intrinsic::bswap)
+        return ReplaceInstUsesWith(CI, Operand->getOperand(1));
+    break;
+  case Intrinsic::ppc_altivec_lvx:
+  case Intrinsic::ppc_altivec_lvxl:
+  case Intrinsic::x86_sse_loadu_ps:
+  case Intrinsic::x86_sse2_loadu_pd:
+  case Intrinsic::x86_sse2_loadu_dq:
+    // Turn PPC lvx     -> load if the pointer is known aligned.
+    // Turn X86 loadups -> load if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
+      Value *Ptr = InsertBitCastBefore(II->getOperand(1),
+                                       PointerType::getUnqual(II->getType()),
+                                       CI);
+      return new LoadInst(Ptr);
+    }
+    break;
+  case Intrinsic::ppc_altivec_stvx:
+  case Intrinsic::ppc_altivec_stvxl:
+    // Turn stvx -> store if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(2), 16) >= 16) {
+      const Type *OpPtrTy = 
+        PointerType::getUnqual(II->getOperand(1)->getType());
+      Value *Ptr = InsertBitCastBefore(II->getOperand(2), OpPtrTy, CI);
+      return new StoreInst(II->getOperand(1), Ptr);
+    }
+    break;
+  case Intrinsic::x86_sse_storeu_ps:
+  case Intrinsic::x86_sse2_storeu_pd:
+  case Intrinsic::x86_sse2_storeu_dq:
+  case Intrinsic::x86_sse2_storel_dq:
+    // Turn X86 storeu -> store if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
+      const Type *OpPtrTy = 
+        PointerType::getUnqual(II->getOperand(2)->getType());
+      Value *Ptr = InsertBitCastBefore(II->getOperand(1), OpPtrTy, CI);
+      return new StoreInst(II->getOperand(2), Ptr);
+    }
+    break;
+    
+  case Intrinsic::x86_sse_cvttss2si: {
+    // These intrinsics only demands the 0th element of its input vector.  If
+    // we can simplify the input based on that, do so now.
+    uint64_t UndefElts;
+    if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1, 
+                                              UndefElts)) {
+      II->setOperand(1, V);
+      return II;
+    }
+    break;
+  }
+    
+  case Intrinsic::ppc_altivec_vperm:
+    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+    if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getOperand(3))) {
+      assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
+      
+      // Check that all of the elements are integer constants or undefs.
+      bool AllEltsOk = true;
+      for (unsigned i = 0; i != 16; ++i) {
+        if (!isa<ConstantInt>(Mask->getOperand(i)) && 
+            !isa<UndefValue>(Mask->getOperand(i))) {
+          AllEltsOk = false;
+          break;
+        }
       }
-      break;
-    }
       
-    case Intrinsic::ppc_altivec_vperm:
-      // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
-      if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getOperand(3))) {
-        assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
+      if (AllEltsOk) {
+        // Cast the input vectors to byte vectors.
+        Value *Op0 =InsertBitCastBefore(II->getOperand(1),Mask->getType(),CI);
+        Value *Op1 =InsertBitCastBefore(II->getOperand(2),Mask->getType(),CI);
+        Value *Result = UndefValue::get(Op0->getType());
+        
+        // Only extract each element once.
+        Value *ExtractedElts[32];
+        memset(ExtractedElts, 0, sizeof(ExtractedElts));
         
-        // Check that all of the elements are integer constants or undefs.
-        bool AllEltsOk = true;
         for (unsigned i = 0; i != 16; ++i) {
-          if (!isa<ConstantInt>(Mask->getOperand(i)) && 
-              !isa<UndefValue>(Mask->getOperand(i))) {
-            AllEltsOk = false;
-            break;
-          }
-        }
-        
-        if (AllEltsOk) {
-          // Cast the input vectors to byte vectors.
-          Value *Op0 =InsertBitCastBefore(II->getOperand(1),Mask->getType(),CI);
-          Value *Op1 =InsertBitCastBefore(II->getOperand(2),Mask->getType(),CI);
-          Value *Result = UndefValue::get(Op0->getType());
-          
-          // Only extract each element once.
-          Value *ExtractedElts[32];
-          memset(ExtractedElts, 0, sizeof(ExtractedElts));
-          
-          for (unsigned i = 0; i != 16; ++i) {
-            if (isa<UndefValue>(Mask->getOperand(i)))
-              continue;
-            unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue();
-            Idx &= 31;  // Match the hardware behavior.
-            
-            if (ExtractedElts[Idx] == 0) {
-              Instruction *Elt = 
-                new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
-              InsertNewInstBefore(Elt, CI);
-              ExtractedElts[Idx] = Elt;
-            }
+          if (isa<UndefValue>(Mask->getOperand(i)))
+            continue;
+          unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue();
+          Idx &= 31;  // Match the hardware behavior.
           
-            // Insert this value into the result vector.
-            Result = InsertElementInst::Create(Result, ExtractedElts[Idx], i, "tmp");
-            InsertNewInstBefore(cast<Instruction>(Result), CI);
+          if (ExtractedElts[Idx] == 0) {
+            Instruction *Elt = 
+              new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
+            InsertNewInstBefore(Elt, CI);
+            ExtractedElts[Idx] = Elt;
           }
-          return CastInst::create(Instruction::BitCast, Result, CI.getType());
+        
+          // Insert this value into the result vector.
+          Result = InsertElementInst::Create(Result, ExtractedElts[Idx],
+                                             i, "tmp");
+          InsertNewInstBefore(cast<Instruction>(Result), CI);
         }
+        return CastInst::Create(Instruction::BitCast, Result, CI.getType());
       }
-      break;
+    }
+    break;
 
-    case Intrinsic::stackrestore: {
-      // If the save is right next to the restore, remove the restore.  This can
-      // happen when variable allocas are DCE'd.
-      if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getOperand(1))) {
-        if (SS->getIntrinsicID() == Intrinsic::stacksave) {
-          BasicBlock::iterator BI = SS;
-          if (&*++BI == II)
-            return EraseInstFromFunction(CI);
-        }
+  case Intrinsic::stackrestore: {
+    // If the save is right next to the restore, remove the restore.  This can
+    // happen when variable allocas are DCE'd.
+    if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getOperand(1))) {
+      if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+        BasicBlock::iterator BI = SS;
+        if (&*++BI == II)
+          return EraseInstFromFunction(CI);
       }
-      
-      // Scan down this block to see if there is another stack restore in the
-      // same block without an intervening call/alloca.
-      BasicBlock::iterator BI = II;
-      TerminatorInst *TI = II->getParent()->getTerminator();
-      bool CannotRemove = false;
-      for (++BI; &*BI != TI; ++BI) {
-        if (isa<AllocaInst>(BI)) {
+    }
+    
+    // Scan down this block to see if there is another stack restore in the
+    // same block without an intervening call/alloca.
+    BasicBlock::iterator BI = II;
+    TerminatorInst *TI = II->getParent()->getTerminator();
+    bool CannotRemove = false;
+    for (++BI; &*BI != TI; ++BI) {
+      if (isa<AllocaInst>(BI)) {
+        CannotRemove = true;
+        break;
+      }
+      if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
+        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
+          // If there is a stackrestore below this one, remove this one.
+          if (II->getIntrinsicID() == Intrinsic::stackrestore)
+            return EraseInstFromFunction(CI);
+          // Otherwise, ignore the intrinsic.
+        } else {
+          // If we found a non-intrinsic call, we can't remove the stack
+          // restore.
           CannotRemove = true;
           break;
         }
-        if (isa<CallInst>(BI)) {
-          if (!isa<IntrinsicInst>(BI)) {
-            CannotRemove = true;
-            break;
-          }
-          // If there is a stackrestore below this one, remove this one.
-          return EraseInstFromFunction(CI);
-        }
       }
-      
-      // If the stack restore is in a return/unwind block and if there are no
-      // allocas or calls between the restore and the return, nuke the restore.
-      if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)))
-        return EraseInstFromFunction(CI);
-      break;
-    }
     }
+    
+    // If the stack restore is in a return/unwind block and if there are no
+    // allocas or calls between the restore and the return, nuke the restore.
+    if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)))
+      return EraseInstFromFunction(CI);
+    break;
+  }
   }
 
   return visitCallSite(II);
@@ -8581,6 +8734,31 @@
   return visitCallSite(&II);
 }
 
+/// isSafeToEliminateVarargsCast - If this cast does not affect the value 
+/// passed through the varargs area, we can eliminate the use of the cast.
+static bool isSafeToEliminateVarargsCast(const CallSite CS,
+                                         const CastInst * const CI,
+                                         const TargetData * const TD,
+                                         const int ix) {
+  if (!CI->isLosslessCast())
+    return false;
+
+  // The size of ByVal arguments is derived from the type, so we
+  // can't change to a type with a different size.  If the size were
+  // passed explicitly we could avoid this check.
+  if (!CS.paramHasAttr(ix, ParamAttr::ByVal))
+    return true;
+
+  const Type* SrcTy = 
+            cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
+  const Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
+  if (!SrcTy->isSized() || !DstTy->isSized())
+    return false;
+  if (TD->getABITypeSize(SrcTy) != TD->getABITypeSize(DstTy))
+    return false;
+  return true;
+}
+
 // visitCallSite - Improvements for call and invoke instructions.
 //
 Instruction *InstCombiner::visitCallSite(CallSite CS) {
@@ -8635,19 +8813,17 @@
   const PointerType *PTy = cast<PointerType>(Callee->getType());
   const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
   if (FTy->isVarArg()) {
+    int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 3 : 1);
     // See if we can optimize any arguments passed through the varargs area of
     // the call.
     for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(),
-           E = CS.arg_end(); I != E; ++I)
-      if (CastInst *CI = dyn_cast<CastInst>(*I)) {
-        // If this cast does not effect the value passed through the varargs
-        // area, we can eliminate the use of the cast.
-        Value *Op = CI->getOperand(0);
-        if (CI->isLosslessCast()) {
-          *I = Op;
-          Changed = true;
-        }
+           E = CS.arg_end(); I != E; ++I, ++ix) {
+      CastInst *CI = dyn_cast<CastInst>(*I);
+      if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) {
+        *I = CI->getOperand(0);
+        Changed = true;
       }
+    }
   }
 
   if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) {
@@ -8678,27 +8854,28 @@
   //
   const FunctionType *FT = Callee->getFunctionType();
   const Type *OldRetTy = Caller->getType();
+  const Type *NewRetTy = FT->getReturnType();
 
-  if (isa<StructType>(FT->getReturnType()))
+  if (isa<StructType>(NewRetTy))
     return false; // TODO: Handle multiple return values.
 
   // Check to see if we are changing the return type...
-  if (OldRetTy != FT->getReturnType()) {
-    if (Callee->isDeclaration() && !Caller->use_empty() && 
-        // Conversion is ok if changing from pointer to int of same size.
-        !(isa<PointerType>(FT->getReturnType()) &&
-          TD->getIntPtrType() == OldRetTy))
+  if (OldRetTy != NewRetTy) {
+    if (Callee->isDeclaration() &&
+        // Conversion is ok if changing from one pointer type to another or from
+        // a pointer to an integer of the same size.
+        !((isa<PointerType>(OldRetTy) || OldRetTy == TD->getIntPtrType()) &&
+          (isa<PointerType>(NewRetTy) || NewRetTy == TD->getIntPtrType())))
       return false;   // Cannot transform this return value.
 
     if (!Caller->use_empty() &&
         // void -> non-void is handled specially
-        FT->getReturnType() != Type::VoidTy &&
-        !CastInst::isCastable(FT->getReturnType(), OldRetTy))
+        NewRetTy != Type::VoidTy && !CastInst::isCastable(NewRetTy, OldRetTy))
       return false;   // Cannot transform this return value.
 
     if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
       ParameterAttributes RAttrs = CallerPAL.getParamAttrs(0);
-      if (RAttrs & ParamAttr::typeIncompatible(FT->getReturnType()))
+      if (RAttrs & ParamAttr::typeIncompatible(NewRetTy))
         return false;   // Attribute not compatible with transformed value.
     }
 
@@ -8730,15 +8907,11 @@
     if (CallerPAL.getParamAttrs(i + 1) & ParamAttr::typeIncompatible(ParamTy))
       return false;   // Attribute not compatible with transformed value.
 
-    ConstantInt *c = dyn_cast<ConstantInt>(*AI);
-    // Some conversions are safe even if we do not have a body.
-    // Either we can cast directly, or we can upconvert the argument
+    // Converting from one pointer type to another or between a pointer and an
+    // integer of the same size is safe even if we do not have a body.
     bool isConvertible = ActTy == ParamTy ||
-      (isa<PointerType>(ParamTy) && isa<PointerType>(ActTy)) ||
-      (ParamTy->isInteger() && ActTy->isInteger() &&
-       ParamTy->getPrimitiveSizeInBits() >= ActTy->getPrimitiveSizeInBits()) ||
-      (c && ParamTy->getPrimitiveSizeInBits() >= ActTy->getPrimitiveSizeInBits()
-       && c->getValue().isStrictlyPositive());
+      ((isa<PointerType>(ParamTy) || ParamTy == TD->getIntPtrType()) &&
+       (isa<PointerType>(ActTy) || ActTy == TD->getIntPtrType()));
     if (Callee->isDeclaration() && !isConvertible) return false;
   }
 
@@ -8771,7 +8944,7 @@
 
   // If the return value is not being used, the type may not be compatible
   // with the existing attributes.  Wipe out any problematic attributes.
-  RAttrs &= ~ParamAttr::typeIncompatible(FT->getReturnType());
+  RAttrs &= ~ParamAttr::typeIncompatible(NewRetTy);
 
   // Add the new return attributes.
   if (RAttrs)
@@ -8785,7 +8958,7 @@
     } else {
       Instruction::CastOps opcode = CastInst::getCastOpcode(*AI,
           false, ParamTy, false);
-      CastInst *NewCast = CastInst::create(opcode, *AI, ParamTy, "tmp");
+      CastInst *NewCast = CastInst::Create(opcode, *AI, ParamTy, "tmp");
       Args.push_back(InsertNewInstBefore(NewCast, *Caller));
     }
 
@@ -8812,7 +8985,7 @@
           // Must promote to pass through va_arg area!
           Instruction::CastOps opcode = CastInst::getCastOpcode(*AI, false, 
                                                                 PTy, false);
-          Instruction *Cast = CastInst::create(opcode, *AI, PTy, "tmp");
+          Instruction *Cast = CastInst::Create(opcode, *AI, PTy, "tmp");
           InsertNewInstBefore(Cast, *Caller);
           Args.push_back(Cast);
         } else {
@@ -8826,7 +8999,7 @@
     }
   }
 
-  if (FT->getReturnType() == Type::VoidTy)
+  if (NewRetTy == Type::VoidTy)
     Caller->setName("");   // Void type should not have a name.
 
   const PAListPtr &NewCallerPAL = PAListPtr::get(attrVec.begin(),attrVec.end());
@@ -8834,7 +9007,8 @@
   Instruction *NC;
   if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
     NC = InvokeInst::Create(Callee, II->getNormalDest(), II->getUnwindDest(),
-                            Args.begin(), Args.end(), Caller->getName(), Caller);
+                            Args.begin(), Args.end(),
+                            Caller->getName(), Caller);
     cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
     cast<InvokeInst>(NC)->setParamAttrs(NewCallerPAL);
   } else {
@@ -8853,13 +9027,12 @@
     if (NV->getType() != Type::VoidTy) {
       Instruction::CastOps opcode = CastInst::getCastOpcode(NC, false, 
                                                             OldRetTy, false);
-      NV = NC = CastInst::create(opcode, NC, OldRetTy, "tmp");
+      NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp");
 
       // If this is an invoke instruction, we should insert it after the first
       // non-phi, instruction in the normal successor block.
       if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
-        BasicBlock::iterator I = II->getNormalDest()->begin();
-        while (isa<PHINode>(I)) ++I;
+        BasicBlock::iterator I = II->getNormalDest()->getFirstNonPHI();
         InsertNewInstBefore(NC, *I);
       } else {
         // Otherwise, it's a call, just insert cast right after the call instr
@@ -8895,8 +9068,7 @@
   IntrinsicInst *Tramp =
     cast<IntrinsicInst>(cast<BitCastInst>(Callee)->getOperand(0));
 
-  Function *NestF =
-    cast<Function>(IntrinsicInst::StripPointerCasts(Tramp->getOperand(2)));
+  Function *NestF = cast<Function>(Tramp->getOperand(2)->stripPointerCasts());
   const PointerType *NestFPTy = cast<PointerType>(NestF->getType());
   const FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
 
@@ -9076,7 +9248,8 @@
   Value *InRHS = FirstInst->getOperand(1);
   PHINode *NewLHS = 0, *NewRHS = 0;
   if (LHSVal == 0) {
-    NewLHS = PHINode::Create(LHSType, FirstInst->getOperand(0)->getName()+".pn");
+    NewLHS = PHINode::Create(LHSType,
+                             FirstInst->getOperand(0)->getName() + ".pn");
     NewLHS->reserveOperandSpace(PN.getNumOperands()/2);
     NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
     InsertNewInstBefore(NewLHS, PN);
@@ -9084,7 +9257,8 @@
   }
   
   if (RHSVal == 0) {
-    NewRHS = PHINode::Create(RHSType, FirstInst->getOperand(1)->getName()+".pn");
+    NewRHS = PHINode::Create(RHSType,
+                             FirstInst->getOperand(1)->getName() + ".pn");
     NewRHS->reserveOperandSpace(PN.getNumOperands()/2);
     NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
     InsertNewInstBefore(NewRHS, PN);
@@ -9104,9 +9278,9 @@
   }
     
   if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
-    return BinaryOperator::create(BinOp->getOpcode(), LHSVal, RHSVal);
+    return BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
   else if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
-    return CmpInst::create(CIOp->getOpcode(), CIOp->getPredicate(), LHSVal, 
+    return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), LHSVal, 
                            RHSVal);
   else {
     assert(isa<GetElementPtrInst>(FirstInst));
@@ -9205,6 +9379,15 @@
           LI->getParent() != PN.getIncomingBlock(i) ||
           !isSafeToSinkLoad(LI))
         return 0;
+      
+      // If the PHI is volatile and its block has multiple successors, sinking
+      // it would remove a load of the volatile value from the path through the
+      // other successor.
+      if (isVolatile &&
+          LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+        return 0;
+
+      
     } else if (I->getOperand(1) != ConstantOp) {
       return 0;
     }
@@ -9240,17 +9423,22 @@
 
   // Insert and return the new operation.
   if (CastInst* FirstCI = dyn_cast<CastInst>(FirstInst))
-    return CastInst::create(FirstCI->getOpcode(), PhiVal, PN.getType());
-  else if (isa<LoadInst>(FirstInst))
-    return new LoadInst(PhiVal, "", isVolatile);
-  else if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
-    return BinaryOperator::create(BinOp->getOpcode(), PhiVal, ConstantOp);
-  else if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
-    return CmpInst::create(CIOp->getOpcode(), CIOp->getPredicate(), 
+    return CastInst::Create(FirstCI->getOpcode(), PhiVal, PN.getType());
+  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
+    return BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
+  if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
+    return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), 
                            PhiVal, ConstantOp);
-  else
-    assert(0 && "Unknown operation");
-  return 0;
+  assert(isa<LoadInst>(FirstInst) && "Unknown operation");
+  
+  // If this was a volatile load that we are merging, make sure to loop through
+  // and mark all the input loads as non-volatile.  If we don't do this, we will
+  // insert a new volatile load and the old ones will not be deletable.
+  if (isVolatile)
+    for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+      cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false);
+  
+  return new LoadInst(PhiVal, "", isVolatile);
 }
 
 /// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
@@ -9416,9 +9604,10 @@
   bool MadeChange = false;
   
   gep_type_iterator GTI = gep_type_begin(GEP);
-  for (unsigned i = 1, e = GEP.getNumOperands(); i != e; ++i, ++GTI) {
+  for (User::op_iterator i = GEP.op_begin() + 1, e = GEP.op_end();
+       i != e; ++i, ++GTI) {
     if (isa<SequentialType>(*GTI)) {
-      if (CastInst *CI = dyn_cast<CastInst>(GEP.getOperand(i))) {
+      if (CastInst *CI = dyn_cast<CastInst>(*i)) {
         if (CI->getOpcode() == Instruction::ZExt ||
             CI->getOpcode() == Instruction::SExt) {
           const Type *SrcTy = CI->getOperand(0)->getType();
@@ -9426,7 +9615,7 @@
           // is a 32-bit pointer target.
           if (SrcTy->getPrimitiveSizeInBits() >= TD->getPointerSizeInBits()) {
             MadeChange = true;
-            GEP.setOperand(i, CI->getOperand(0));
+            *i = CI->getOperand(0);
           }
         }
       }
@@ -9434,15 +9623,15 @@
       // to what we need.  If the incoming value needs a cast instruction,
       // insert it.  This explicit cast can make subsequent optimizations more
       // obvious.
-      Value *Op = GEP.getOperand(i);
+      Value *Op = *i;
       if (TD->getTypeSizeInBits(Op->getType()) > TD->getPointerSizeInBits()) {
         if (Constant *C = dyn_cast<Constant>(Op)) {
-          GEP.setOperand(i, ConstantExpr::getTrunc(C, TD->getIntPtrType()));
+          *i = ConstantExpr::getTrunc(C, TD->getIntPtrType());
           MadeChange = true;
         } else {
           Op = InsertCastBefore(Instruction::Trunc, Op, TD->getIntPtrType(),
                                 GEP);
-          GEP.setOperand(i, Op);
+          *i = Op;
           MadeChange = true;
         }
       }
@@ -9534,7 +9723,7 @@
         if (isa<Constant>(SO1) && isa<Constant>(GO1))
           Sum = ConstantExpr::getAdd(cast<Constant>(SO1), cast<Constant>(GO1));
         else {
-          Sum = BinaryOperator::createAdd(SO1, GO1, PtrOp->getName()+".sum");
+          Sum = BinaryOperator::CreateAdd(SO1, GO1, PtrOp->getName()+".sum");
           InsertNewInstBefore(cast<Instruction>(Sum), GEP);
         }
       }
@@ -9665,7 +9854,7 @@
           if (Scale->getZExtValue() != 1) {
             Constant *C = ConstantExpr::getIntegerCast(Scale, NewIdx->getType(),
                                                        false /*ZExt*/);
-            Instruction *Sc = BinaryOperator::createMul(NewIdx, C, "idxscale");
+            Instruction *Sc = BinaryOperator::CreateMul(NewIdx, C, "idxscale");
             NewIdx = InsertNewInstBefore(Sc, GEP);
           }
 
@@ -9789,8 +9978,8 @@
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(CI)) {
     // Instead of loading constant c string, use corresponding integer value
     // directly if string length is small enough.
-    const std::string &Str = CE->getOperand(0)->getStringValue();
-    if (!Str.empty()) {
+    std::string Str;
+    if (GetConstantStringInfo(CE->getOperand(0), Str) && !Str.empty()) {
       unsigned len = Str.length();
       const Type *Ty = cast<PointerType>(CE->getType())->getElementType();
       unsigned numBits = Ty->getPrimitiveSizeInBits();
@@ -9882,10 +10071,16 @@
   while (BBI != E) {
     --BBI;
 
+    // If we see a free or a call (which might do a free) the pointer could be
+    // marked invalid.
+    if (isa<FreeInst>(BBI) || isa<CallInst>(BBI))
+      return false;
+    
     if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
       if (LI->getOperand(0) == V) return true;
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
       if (SI->getOperand(1) == V) return true;
+    }
 
   }
   return false;
@@ -10099,7 +10294,7 @@
           NewCast = ConstantExpr::getCast(opcode, C, CastDstTy);
         else
           NewCast = IC.InsertNewInstBefore(
-            CastInst::create(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"), 
+            CastInst::Create(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"), 
             SI);
         return new StoreInst(NewCast, CastOp);
       }
@@ -10120,7 +10315,7 @@
   
   // If the RHS is an alloca with a single use, zapify the store, making the
   // alloca dead.
-  if (Ptr->hasOneUse()) {
+  if (Ptr->hasOneUse() && !SI.isVolatile()) {
     if (isa<AllocaInst>(Ptr)) {
       EraseInstFromFunction(SI);
       ++NumCombined;
@@ -10177,7 +10372,7 @@
     }
     
     // Don't skip over loads or things that can modify memory.
-    if (BBI->mayWriteToMemory())
+    if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
       break;
   }
   
@@ -10257,8 +10452,12 @@
   }
   if (++PI != pred_end(DestBB))
     return false;
-  
-  
+
+  // Bail out if all the relevant blocks aren't distinct (this can happen,
+  // for example, if SI is in an infinite loop)
+  if (StoreBB == DestBB || OtherBB == DestBB)
+    return false;
+
   // Verify that the other block ends in a branch and is not otherwise empty.
   BasicBlock::iterator BBI = OtherBB->getTerminator();
   BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
@@ -10291,18 +10490,19 @@
           return false;
         break;
       }
-      // If we find something that may be using the stored value, or if we run
-      // out of instructions, we can't do the xform.
-      if (isa<LoadInst>(BBI) || BBI->mayWriteToMemory() ||
+      // If we find something that may be using or overwriting the stored
+      // value, or if we run out of instructions, we can't do the xform.
+      if (BBI->mayReadFromMemory() || BBI->mayWriteToMemory() ||
           BBI == OtherBB->begin())
         return false;
     }
     
     // In order to eliminate the store in OtherBr, we have to
-    // make sure nothing reads the stored value in StoreBB.
+    // make sure nothing reads or overwrites the stored value in
+    // StoreBB.
     for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
       // FIXME: This should really be AA driven.
-      if (isa<LoadInst>(I) || I->mayWriteToMemory())
+      if (I->mayReadFromMemory() || I->mayWriteToMemory())
         return false;
     }
   }
@@ -10319,8 +10519,7 @@
   
   // Advance to a place where it is safe to insert the new store and
   // insert it.
-  BBI = DestBB->begin();
-  while (isa<PHINode>(BBI)) ++BBI;
+  BBI = DestBB->getFirstNonPHI();
   InsertNewInstBefore(new StoreInst(MergedVal, SI.getOperand(1),
                                     OtherStore->isVolatile()), *BBI);
   
@@ -10407,6 +10606,16 @@
   return 0;
 }
 
+Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
+  // See if we are trying to extract a known value. If so, use that instead.
+  if (Value *Elt = FindInsertedValue(EV.getOperand(0), EV.idx_begin(),
+                                     EV.idx_end(), &EV))
+    return ReplaceInstUsesWith(EV, Elt);
+
+  // No changes
+  return 0;
+}
+
 /// CheapToScalarize - Return true if the value is cheaper to scalarize than it
 /// is to leave as a vector operation.
 static bool CheapToScalarize(Value *V, bool isConstant) {
@@ -10458,11 +10667,11 @@
 
   std::vector<unsigned> Result;
   const ConstantVector *CP = cast<ConstantVector>(SVI->getOperand(2));
-  for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
-    if (isa<UndefValue>(CP->getOperand(i)))
+  for (User::const_op_iterator i = CP->op_begin(), e = CP->op_end(); i!=e; ++i)
+    if (isa<UndefValue>(*i))
       Result.push_back(NElts*2);  // undef -> 8
     else
-      Result.push_back(cast<ConstantInt>(CP->getOperand(i))->getZExtValue());
+      Result.push_back(cast<ConstantInt>(*i)->getZExtValue());
   return Result;
 }
 
@@ -10511,7 +10720,6 @@
 }
 
 Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
-
   // If vector val is undef, replace extract with scalar undef.
   if (isa<UndefValue>(EI.getOperand(0)))
     return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
@@ -10521,8 +10729,9 @@
     return ReplaceInstUsesWith(EI, Constant::getNullValue(EI.getType()));
   
   if (ConstantVector *C = dyn_cast<ConstantVector>(EI.getOperand(0))) {
-    // If vector val is constant with uniform operands, replace EI
-    // with that operand
+    // If vector val is constant with all elements the same, replace EI with
+    // that element. When the elements are not identical, we cannot replace yet
+    // (we do that below, but only when the index is constant).
     Constant *op0 = C->getOperand(0);
     for (unsigned i = 1; i < C->getNumOperands(); ++i)
       if (C->getOperand(i) != op0) {
@@ -10588,15 +10797,15 @@
                                    EI.getName()+".rhs");
           InsertNewInstBefore(newEI0, EI);
           InsertNewInstBefore(newEI1, EI);
-          return BinaryOperator::create(BO->getOpcode(), newEI0, newEI1);
+          return BinaryOperator::Create(BO->getOpcode(), newEI0, newEI1);
         }
       } else if (isa<LoadInst>(I)) {
         unsigned AS = 
           cast<PointerType>(I->getOperand(0)->getType())->getAddressSpace();
         Value *Ptr = InsertBitCastBefore(I->getOperand(0),
                                          PointerType::get(EI.getType(), AS),EI);
-        GetElementPtrInst *GEP = 
-          GetElementPtrInst::Create(Ptr, EI.getOperand(1), I->getName() + ".gep");
+        GetElementPtrInst *GEP =
+          GetElementPtrInst::Create(Ptr, EI.getOperand(1), I->getName()+".gep");
         InsertNewInstBefore(GEP, EI);
         return new LoadInst(GEP);
       }
@@ -10971,7 +11180,8 @@
   assert(I->hasOneUse() && "Invariants didn't hold!");
 
   // Cannot move control-flow-involving, volatile loads, vaarg, etc.
-  if (isa<PHINode>(I) || I->mayWriteToMemory()) return false;
+  if (isa<PHINode>(I) || I->mayWriteToMemory() || isa<TerminatorInst>(I))
+    return false;
 
   // Do not sink alloca instructions out of the entry block.
   if (isa<AllocaInst>(I) && I->getParent() ==
@@ -10980,15 +11190,14 @@
 
   // We can only sink load instructions if there is nothing between the load and
   // the end of block that could change the value.
-  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    for (BasicBlock::iterator Scan = LI, E = LI->getParent()->end();
+  if (I->mayReadFromMemory()) {
+    for (BasicBlock::iterator Scan = I, E = I->getParent()->end();
          Scan != E; ++Scan)
       if (Scan->mayWriteToMemory())
         return false;
   }
 
-  BasicBlock::iterator InsertPos = DestBlock->begin();
-  while (isa<PHINode>(InsertPos)) ++InsertPos;
+  BasicBlock::iterator InsertPos = DestBlock->getFirstNonPHI();
 
   I->moveBefore(InsertPos);
   ++NumSunkInst;
@@ -11044,15 +11253,12 @@
 
     // Recursively visit successors.  If this is a branch or switch on a
     // constant, only visit the reachable successor.
-    if (BB->getUnwindDest())
-      Worklist.push_back(BB->getUnwindDest());
     TerminatorInst *TI = BB->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
       if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
         bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
         BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
-        if (ReachableBB != BB->getUnwindDest())
-          Worklist.push_back(ReachableBB);
+        Worklist.push_back(ReachableBB);
         continue;
       }
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
@@ -11061,8 +11267,7 @@
         for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
           if (SI->getCaseValue(i) == Cond) {
             BasicBlock *ReachableBB = SI->getSuccessor(i);
-            if (ReachableBB != BB->getUnwindDest())
-              Worklist.push_back(ReachableBB);
+            Worklist.push_back(ReachableBB);
             continue;
           }
         
@@ -11142,8 +11347,20 @@
       continue;
     }
 
+    if (TD && I->getType()->getTypeID() == Type::VoidTyID) {
+      // See if we can constant fold its operands.
+      for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
+        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(i)) {
+          if (Constant *NewC = ConstantFoldConstantExpression(CE, TD))
+            i->set(NewC);
+        }
+      }
+    }
+
     // See if we can trivially sink this instruction to a successor basic block.
-    if (I->hasOneUse()) {
+    // FIXME: Remove GetResultInst test when first class support for aggregates
+    // is implemented.
+    if (I->hasOneUse() && !isa<GetResultInst>(I)) {
       BasicBlock *BB = I->getParent();
       BasicBlock *UserParent = cast<Instruction>(I->use_back())->getParent();
       if (UserParent != BB) {
@@ -11248,7 +11465,7 @@
 
   // Iterate while there is work to do.
   unsigned Iteration = 0;
-  while (DoOneIteration(F, Iteration++)) 
+  while (DoOneIteration(F, Iteration++))
     EverMadeChange = true;
   return EverMadeChange;
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/JumpThreading.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/JumpThreading.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/JumpThreading.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/JumpThreading.cpp Sun Jul  6 15:45:41 2008
@@ -17,6 +17,7 @@
 #include "llvm/Pass.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
@@ -56,11 +57,18 @@
     bool runOnFunction(Function &F);
     bool ThreadBlock(BasicBlock *BB);
     void ThreadEdge(BasicBlock *BB, BasicBlock *PredBB, BasicBlock *SuccBB);
+    BasicBlock *FactorCommonPHIPreds(PHINode *PN, Constant *CstVal);
+
+    bool ProcessJumpOnPHI(PHINode *PN);
+    bool ProcessBranchOnLogical(Value *V, BasicBlock *BB, bool isAnd);
+    bool ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB);
   };
-  char JumpThreading::ID = 0;
-  RegisterPass<JumpThreading> X("jump-threading", "Jump Threading");
 }
 
+char JumpThreading::ID = 0;
+static RegisterPass<JumpThreading>
+X("jump-threading", "Jump Threading");
+
 // Public interface to the Jump Threading pass
 FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
 
@@ -82,12 +90,34 @@
   return EverChanged;
 }
 
+/// FactorCommonPHIPreds - If there are multiple preds with the same incoming
+/// value for the PHI, factor them together so we get one block to thread for
+/// the whole group.
+/// This is important for things like "phi i1 [true, true, false, true, x]"
+/// where we only need to clone the block for the true blocks once.
+///
+BasicBlock *JumpThreading::FactorCommonPHIPreds(PHINode *PN, Constant *CstVal) {
+  SmallVector<BasicBlock*, 16> CommonPreds;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == CstVal)
+      CommonPreds.push_back(PN->getIncomingBlock(i));
+  
+  if (CommonPreds.size() == 1)
+    return CommonPreds[0];
+    
+  DOUT << "  Factoring out " << CommonPreds.size()
+       << " common predecessors.\n";
+  return SplitBlockPredecessors(PN->getParent(),
+                                &CommonPreds[0], CommonPreds.size(),
+                                ".thr_comm", this);
+}
+  
+
 /// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
 /// thread across it.
 static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
-  BasicBlock::const_iterator I = BB->begin();
   /// Ignore PHI nodes, these will be flattened when duplication happens.
-  while (isa<PHINode>(*I)) ++I;
+  BasicBlock::const_iterator I = BB->getFirstNonPHI();
 
   // Sum up the cost of each instruction until we get to the terminator.  Don't
   // include the terminator because the copy won't include it.
@@ -127,7 +157,7 @@
 /// ThreadBlock - If there are any predecessors whose control can be threaded
 /// through to a successor, transform them now.
 bool JumpThreading::ThreadBlock(BasicBlock *BB) {
-  // See if this block ends with a branch of switch.  If so, see if the
+  // See if this block ends with a branch or switch.  If so, see if the
   // condition is a phi node.  If so, and if an entry of the phi node is a
   // constant, we can thread the block.
   Value *Condition;
@@ -141,7 +171,7 @@
     return false; // Must be an invoke.
   
   // If the terminator of this block is branching on a constant, simplify the
-  // terminator to an unconditional branch.  This can occur do to threading in
+  // terminator to an unconditional branch.  This can occur due to threading in
   // other blocks.
   if (isa<ConstantInt>(Condition)) {
     DOUT << "  In block '" << BB->getNameStart()
@@ -157,25 +187,50 @@
 
   // See if this is a phi node in the current block.
   PHINode *PN = dyn_cast<PHINode>(Condition);
-  if (!PN || PN->getParent() != BB) return false;
+  if (PN && PN->getParent() == BB)
+    return ProcessJumpOnPHI(PN);
+  
+  // If this is a conditional branch whose condition is and/or of a phi, try to
+  // simplify it.
+  if (BinaryOperator *CondI = dyn_cast<BinaryOperator>(Condition)) {
+    if ((CondI->getOpcode() == Instruction::And || 
+         CondI->getOpcode() == Instruction::Or) &&
+        isa<BranchInst>(BB->getTerminator()) &&
+        ProcessBranchOnLogical(CondI, BB,
+                               CondI->getOpcode() == Instruction::And))
+      return true;
+  }
+  
+  // If we have "br (phi != 42)" and the phi node has any constant values as 
+  // operands, we can thread through this block.
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(Condition))
+    if (isa<PHINode>(CondCmp->getOperand(0)) &&
+        isa<Constant>(CondCmp->getOperand(1)) &&
+        ProcessBranchOnCompare(CondCmp, BB))
+      return true;
   
+  return false;
+}
+
+/// ProcessJumpOnPHI - We have a conditional branch of switch on a PHI node in
+/// the current block.  See if there are any simplifications we can do based on
+/// inputs to the phi node.
+/// 
+bool JumpThreading::ProcessJumpOnPHI(PHINode *PN) {
   // See if the phi node has any constant values.  If so, we can determine where
   // the corresponding predecessor will branch.
-  unsigned PredNo = ~0U;
   ConstantInt *PredCst = 0;
-  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
-    if ((PredCst = dyn_cast<ConstantInt>(PN->getIncomingValue(i)))) {
-      PredNo = i;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if ((PredCst = dyn_cast<ConstantInt>(PN->getIncomingValue(i))))
       break;
-    }
-  }
   
   // If no incoming value has a constant, we don't know the destination of any
   // predecessors.
-  if (PredNo == ~0U)
+  if (PredCst == 0)
     return false;
   
   // See if the cost of duplicating this block is low enough.
+  BasicBlock *BB = PN->getParent();
   unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
   if (JumpThreadCost > Threshold) {
     DOUT << "  Not threading BB '" << BB->getNameStart()
@@ -183,9 +238,11 @@
     return false;
   }
   
-  // If so, we can actually do this threading.  Figure out which predecessor and
-  // which successor we are threading for.
-  BasicBlock *PredBB = PN->getIncomingBlock(PredNo);
+  // If so, we can actually do this threading.  Merge any common predecessors
+  // that will act the same.
+  BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+  
+  // Next, figure out which successor we are threading to.
   BasicBlock *SuccBB;
   if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
     SuccBB = BI->getSuccessor(PredCst == ConstantInt::getFalse());
@@ -194,21 +251,180 @@
     SuccBB = SI->getSuccessor(SI->findCaseValue(PredCst));
   }
   
-  // TODO: If there are multiple preds with the same incoming value for the PHI,
-  // factor them together so we get one thread block for the whole group.  This
-  // is important for things like "phi i1 [true, true, false, true, x]" where
-  // we only need to clone the block for the true blocks once.
+  // If threading to the same block as we come from, we would infinite loop.
+  if (SuccBB == BB) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+         << "' - would thread to self!\n";
+    return false;
+  }
   
+  // And finally, do it!
   DOUT << "  Threading edge from '" << PredBB->getNameStart() << "' to '"
        << SuccBB->getNameStart() << "' with cost: " << JumpThreadCost
        << ", across block:\n    "
-       << *BB;
+       << *BB << "\n";
        
   ThreadEdge(BB, PredBB, SuccBB);
   ++NumThreads;
   return true;
 }
 
+/// ProcessJumpOnLogicalPHI - PN's basic block contains a conditional branch
+/// whose condition is an AND/OR where one side is PN.  If PN has constant
+/// operands that permit us to evaluate the condition for some operand, thread
+/// through the block.  For example with:
+///   br (and X, phi(Y, Z, false))
+/// the predecessor corresponding to the 'false' will always jump to the false
+/// destination of the branch.
+///
+bool JumpThreading::ProcessBranchOnLogical(Value *V, BasicBlock *BB,
+                                           bool isAnd) {
+  // If this is a binary operator tree of the same AND/OR opcode, check the
+  // LHS/RHS.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+    if ((isAnd && BO->getOpcode() == Instruction::And) ||
+        (!isAnd && BO->getOpcode() == Instruction::Or)) {
+      if (ProcessBranchOnLogical(BO->getOperand(0), BB, isAnd))
+        return true;
+      if (ProcessBranchOnLogical(BO->getOperand(1), BB, isAnd))
+        return true;
+    }
+      
+  // If this isn't a PHI node, we can't handle it.
+  PHINode *PN = dyn_cast<PHINode>(V);
+  if (!PN || PN->getParent() != BB) return false;
+                                             
+  // We can only do the simplification for phi nodes of 'false' with AND or
+  // 'true' with OR.  See if we have any entries in the phi for this.
+  unsigned PredNo = ~0U;
+  ConstantInt *PredCst = ConstantInt::get(Type::Int1Ty, !isAnd);
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    if (PN->getIncomingValue(i) == PredCst) {
+      PredNo = i;
+      break;
+    }
+  }
+  
+  // If no match, bail out.
+  if (PredNo == ~0U)
+    return false;
+  
+  // See if the cost of duplicating this block is low enough.
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+  if (JumpThreadCost > Threshold) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+         << "' - Cost is too high: " << JumpThreadCost << "\n";
+    return false;
+  }
+
+  // If so, we can actually do this threading.  Merge any common predecessors
+  // that will act the same.
+  BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+  
+  // Next, figure out which successor we are threading to.  If this was an AND,
+  // the constant must be FALSE, and we must be targeting the 'false' block.
+  // If this is an OR, the constant must be TRUE, and we must be targeting the
+  // 'true' block.
+  BasicBlock *SuccBB = BB->getTerminator()->getSuccessor(isAnd);
+  
+  // If threading to the same block as we come from, we would infinite loop.
+  if (SuccBB == BB) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+    << "' - would thread to self!\n";
+    return false;
+  }
+  
+  // And finally, do it!
+  DOUT << "  Threading edge through bool from '" << PredBB->getNameStart()
+       << "' to '" << SuccBB->getNameStart() << "' with cost: "
+       << JumpThreadCost << ", across block:\n    "
+       << *BB << "\n";
+  
+  ThreadEdge(BB, PredBB, SuccBB);
+  ++NumThreads;
+  return true;
+}
+
+/// ProcessBranchOnCompare - We found a branch on a comparison between a phi
+/// node and a constant.  If the PHI node contains any constants as inputs, we
+/// can fold the compare for that edge and thread through it.
+bool JumpThreading::ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB) {
+  PHINode *PN = cast<PHINode>(Cmp->getOperand(0));
+  Constant *RHS = cast<Constant>(Cmp->getOperand(1));
+  
+  // If the phi isn't in the current block, an incoming edge to this block
+  // doesn't control the destination.
+  if (PN->getParent() != BB)
+    return false;
+  
+  // We can do this simplification if any comparisons fold to true or false.
+  // See if any do.
+  Constant *PredCst = 0;
+  bool TrueDirection = false;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    PredCst = dyn_cast<Constant>(PN->getIncomingValue(i));
+    if (PredCst == 0) continue;
+    
+    Constant *Res;
+    if (ICmpInst *ICI = dyn_cast<ICmpInst>(Cmp))
+      Res = ConstantExpr::getICmp(ICI->getPredicate(), PredCst, RHS);
+    else
+      Res = ConstantExpr::getFCmp(cast<FCmpInst>(Cmp)->getPredicate(),
+                                  PredCst, RHS);
+    // If this folded to a constant expr, we can't do anything.
+    if (ConstantInt *ResC = dyn_cast<ConstantInt>(Res)) {
+      TrueDirection = ResC->getZExtValue();
+      break;
+    }
+    // If this folded to undef, just go the false way.
+    if (isa<UndefValue>(Res)) {
+      TrueDirection = false;
+      break;
+    }
+    
+    // Otherwise, we can't fold this input.
+    PredCst = 0;
+  }
+  
+  // If no match, bail out.
+  if (PredCst == 0)
+    return false;
+  
+  // See if the cost of duplicating this block is low enough.
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+  if (JumpThreadCost > Threshold) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+         << "' - Cost is too high: " << JumpThreadCost << "\n";
+    return false;
+  }
+  
+  // If so, we can actually do this threading.  Merge any common predecessors
+  // that will act the same.
+  BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+  
+  // Next, get our successor.
+  BasicBlock *SuccBB = BB->getTerminator()->getSuccessor(!TrueDirection);
+  
+  // If threading to the same block as we come from, we would infinite loop.
+  if (SuccBB == BB) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+    << "' - would thread to self!\n";
+    return false;
+  }
+  
+  
+  // And finally, do it!
+  DOUT << "  Threading edge through bool from '" << PredBB->getNameStart()
+       << "' to '" << SuccBB->getNameStart() << "' with cost: "
+       << JumpThreadCost << ", across block:\n    "
+       << *BB << "\n";
+  
+  ThreadEdge(BB, PredBB, SuccBB);
+  ++NumThreads;
+  return true;
+}
+
+
 /// ThreadEdge - We have decided that it is safe and profitable to thread an
 /// edge from PredBB to SuccBB across BB.  Transform the IR to reflect this
 /// change.
@@ -218,12 +434,27 @@
   // Jump Threading can not update SSA properties correctly if the values
   // defined in the duplicated block are used outside of the block itself.  For
   // this reason, we spill all values that are used outside of BB to the stack.
-  for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
-    if (I->isUsedOutsideOfBlock(BB)) {
-      // We found a use of I outside of BB.  Create a new stack slot to
-      // break this inter-block usage pattern.
+  for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+    if (!I->isUsedOutsideOfBlock(BB))
+      continue;
+    
+    // We found a use of I outside of BB.  Create a new stack slot to
+    // break this inter-block usage pattern.
+    if (!isa<StructType>(I->getType())) {
       DemoteRegToStack(*I);
+      continue;
     }
+    
+    // Alternatively, I must be a call or invoke that returns multiple retvals.
+    // We can't use 'DemoteRegToStack' because that will create loads and
+    // stores of aggregates which is not valid yet.  If I is a call, we can just
+    // pull all the getresult instructions up to this block.  If I is an invoke,
+    // we are out of luck.
+    BasicBlock::iterator IP = I; ++IP;
+    for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+         UI != E; ++UI)
+      cast<GetResultInst>(UI)->moveBefore(IP);
+  }
  
   // We are going to have to map operands from the original BB block to the new
   // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/LICM.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/LICM.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/LICM.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/LICM.cpp Sun Jul  6 15:45:41 2008
@@ -58,11 +58,11 @@
 STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
 STATISTIC(NumPromoted  , "Number of memory locations promoted to registers");
 
-namespace {
-  cl::opt<bool>
-  DisablePromotion("disable-licm-promotion", cl::Hidden,
-                   cl::desc("Disable memory promotion in LICM pass"));
+static cl::opt<bool>
+DisablePromotion("disable-licm-promotion", cl::Hidden,
+                 cl::desc("Disable memory promotion in LICM pass"));
 
+namespace {
   struct VISIBILITY_HIDDEN LICM : public LoopPass {
     static char ID; // Pass identification, replacement for typeid
     LICM() : LoopPass((intptr_t)&ID) {}
@@ -216,11 +216,11 @@
                    std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues,
                                     std::map<Value*, AllocaInst*> &Val2AlMap);
   };
-
-  char LICM::ID = 0;
-  RegisterPass<LICM> X("licm", "Loop Invariant Code Motion");
 }
 
+char LICM::ID = 0;
+static RegisterPass<LICM> X("licm", "Loop Invariant Code Motion");
+
 LoopPass *llvm::createLICMPass() { return new LICM(); }
 
 /// Hoist expressions out of the specified loop. Note, alias info for inner
@@ -258,10 +258,12 @@
   // Because subloops have already been incorporated into AST, we skip blocks in
   // subloops.
   //
-  for (std::vector<BasicBlock*>::const_iterator I = L->getBlocks().begin(),
-         E = L->getBlocks().end(); I != E; ++I)
-    if (LI->getLoopFor(*I) == L)        // Ignore blocks in subloops...
-      CurAST->add(**I);                 // Incorporate the specified basic block
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    if (LI->getLoopFor(BB) == L)        // Ignore blocks in subloops...
+      CurAST->add(*BB);                 // Incorporate the specified basic block
+  }
 
   // We want to visit all of the instructions in this loop... that are not parts
   // of our subloops (they have already had their invariants hoisted out of
@@ -472,8 +474,7 @@
       // nodes in it.
       I.removeFromParent();
 
-      BasicBlock::iterator InsertPt = ExitBlocks[0]->begin();
-      while (isa<PHINode>(InsertPt)) ++InsertPt;
+      BasicBlock::iterator InsertPt = ExitBlocks[0]->getFirstNonPHI();
       ExitBlocks[0]->getInstList().insert(InsertPt, &I);
     }
   } else if (ExitBlocks.empty()) {
@@ -542,8 +543,7 @@
         // If we haven't already processed this exit block, do so now.
         if (InsertedBlocks.insert(ExitBlock).second) {
           // Insert the code after the last PHI node...
-          BasicBlock::iterator InsertPt = ExitBlock->begin();
-          while (isa<PHINode>(InsertPt)) ++InsertPt;
+          BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI();
 
           // If this is the first exit block processed, just move the original
           // instruction, otherwise clone the original instruction and insert
@@ -700,12 +700,11 @@
   // Scan the basic blocks in the loop, replacing uses of our pointers with
   // uses of the allocas in question.
   //
-  const std::vector<BasicBlock*> &LoopBBs = CurLoop->getBlocks();
-  for (std::vector<BasicBlock*>::const_iterator I = LoopBBs.begin(),
-         E = LoopBBs.end(); I != E; ++I) {
+  for (Loop::block_iterator I = CurLoop->block_begin(),
+         E = CurLoop->block_end(); I != E; ++I) {
+    BasicBlock *BB = *I;
     // Rewrite all loads and stores in the block of the pointer...
-    for (BasicBlock::iterator II = (*I)->begin(), E = (*I)->end();
-         II != E; ++II) {
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
       if (LoadInst *L = dyn_cast<LoadInst>(II)) {
         std::map<Value*, AllocaInst*>::iterator
           I = ValueToAllocaMap.find(L->getOperand(0));
@@ -726,30 +725,30 @@
   // want to insert one copy of the code in each exit block, though the loop may
   // exit to the same block more than once.
   //
-  std::set<BasicBlock*> ProcessedBlocks;
+  SmallPtrSet<BasicBlock*, 16> ProcessedBlocks;
 
   SmallVector<BasicBlock*, 8> ExitBlocks;
   CurLoop->getExitBlocks(ExitBlocks);
-  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
-    if (ProcessedBlocks.insert(ExitBlocks[i]).second) {
-      // Copy all of the allocas into their memory locations.
-      BasicBlock::iterator BI = ExitBlocks[i]->begin();
-      while (isa<PHINode>(*BI))
-        ++BI;             // Skip over all of the phi nodes in the block.
-      Instruction *InsertPos = BI;
-      unsigned PVN = 0;
-      for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) {
-        // Load from the alloca.
-        LoadInst *LI = new LoadInst(PromotedValues[i].first, "", InsertPos);
-
-        // If this is a pointer type, update alias info appropriately.
-        if (isa<PointerType>(LI->getType()))
-          CurAST->copyValue(PointerValueNumbers[PVN++], LI);
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    if (!ProcessedBlocks.insert(ExitBlocks[i]))
+      continue;
+  
+    // Copy all of the allocas into their memory locations.
+    BasicBlock::iterator BI = ExitBlocks[i]->getFirstNonPHI();
+    Instruction *InsertPos = BI;
+    unsigned PVN = 0;
+    for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) {
+      // Load from the alloca.
+      LoadInst *LI = new LoadInst(PromotedValues[i].first, "", InsertPos);
+
+      // If this is a pointer type, update alias info appropriately.
+      if (isa<PointerType>(LI->getType()))
+        CurAST->copyValue(PointerValueNumbers[PVN++], LI);
 
-        // Store into the memory we promoted.
-        new StoreInst(LI, PromotedValues[i].second, InsertPos);
-      }
+      // Store into the memory we promoted.
+      new StoreInst(LI, PromotedValues[i].second, InsertPos);
     }
+  }
 
   // Now that we have done the deed, use the mem2reg functionality to promote
   // all of the new allocas we just created into real SSA registers.
@@ -771,14 +770,8 @@
                              std::map<Value*, AllocaInst*> &ValueToAllocaMap) {
   Instruction *FnStart = CurLoop->getHeader()->getParent()->begin()->begin();
 
-  SmallVector<Instruction *, 4> LoopExits;
-  SmallVector<BasicBlock *, 4> Blocks;
-  CurLoop->getExitingBlocks(Blocks);
-  for (SmallVector<BasicBlock *, 4>::iterator BI = Blocks.begin(),
-         BE = Blocks.end(); BI != BE; ++BI) {
-    BasicBlock *BB = *BI;
-    LoopExits.push_back(BB->getTerminator());
-  }
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  CurLoop->getExitingBlocks(ExitingBlocks);
 
   // Loop over all of the alias sets in the tracker object.
   for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
@@ -787,72 +780,76 @@
     // We can promote this alias set if it has a store, if it is a "Must" alias
     // set, if the pointer is loop invariant, and if we are not eliminating any
     // volatile loads or stores.
-    if (!AS.isForwardingAliasSet() && AS.isMod() && AS.isMustAlias() &&
-        !AS.isVolatile() && CurLoop->isLoopInvariant(AS.begin()->first)) {
-      assert(!AS.empty() &&
-             "Must alias set should have at least one pointer element in it!");
-      Value *V = AS.begin()->first;
-
-      // Check that all of the pointers in the alias set have the same type.  We
-      // cannot (yet) promote a memory location that is loaded and stored in
-      // different sizes.
+    if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+        AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->first))
+      continue;
+    
+    assert(!AS.empty() &&
+           "Must alias set should have at least one pointer element in it!");
+    Value *V = AS.begin()->first;
+
+    // Check that all of the pointers in the alias set have the same type.  We
+    // cannot (yet) promote a memory location that is loaded and stored in
+    // different sizes.
+    {
       bool PointerOk = true;
       for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I)
         if (V->getType() != I->first->getType()) {
           PointerOk = false;
           break;
         }
+      if (!PointerOk)
+        continue;
+    }
 
-      // If one use of value V inside the loop is safe then it is OK to promote 
-      // this value. On the otherside if there is not any unsafe use inside the
-      // loop then also it is OK to promote this value. Otherwise it is
-      // unsafe to promote this value.
-      if (PointerOk) {
-        bool oneSafeUse = false;
-        bool oneUnsafeUse = false;
-        for(Value::use_iterator UI = V->use_begin(), UE = V->use_end();
-            UI != UE; ++UI) {
-          Instruction *Use = dyn_cast<Instruction>(*UI);
-          if (!Use || !CurLoop->contains(Use->getParent()))
-            continue;
-          for (SmallVector<Instruction *, 4>::iterator 
-                 ExitI = LoopExits.begin(), ExitE = LoopExits.end();
-               ExitI != ExitE; ++ExitI) {
-            Instruction *Ex = *ExitI;
-            if (!isa<PHINode>(Use) && DT->dominates(Use, Ex)) {
-              oneSafeUse = true;
-              break;
-            }
-            else 
-              oneUnsafeUse = true;
-          }
-
-          if (oneSafeUse)
-            break;
-        }
-
-        if (oneSafeUse)
-          PointerOk = true;
-        else if (!oneUnsafeUse)
-          PointerOk = true;
-        else
-          PointerOk = false;
+    // It isn't safe to promote a load/store from the loop if the load/store is
+    // conditional.  For example, turning:
+    //
+    //    for () { if (c) *P += 1; }
+    //
+    // into:
+    //
+    //    tmp = *P;  for () { if (c) tmp +=1; } *P = tmp;
+    //
+    // is not safe, because *P may only be valid to access if 'c' is true.
+    // 
+    // It is safe to promote P if all uses are direct load/stores and if at
+    // least one is guaranteed to be executed.
+    bool GuaranteedToExecute = false;
+    bool InvalidInst = false;
+    for (Value::use_iterator UI = V->use_begin(), UE = V->use_end();
+         UI != UE; ++UI) {
+      // Ignore instructions not in this loop.
+      Instruction *Use = dyn_cast<Instruction>(*UI);
+      if (!Use || !CurLoop->contains(Use->getParent()))
+        continue;
+
+      if (!isa<LoadInst>(Use) && !isa<StoreInst>(Use)) {
+        InvalidInst = true;
+        break;
       }
       
-      if (PointerOk) {
-        const Type *Ty = cast<PointerType>(V->getType())->getElementType();
-        AllocaInst *AI = new AllocaInst(Ty, 0, V->getName()+".tmp", FnStart);
-        PromotedValues.push_back(std::make_pair(AI, V));
+      if (!GuaranteedToExecute)
+        GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use);
+    }
 
-        // Update the AST and alias analysis.
-        CurAST->copyValue(V, AI);
+    // If there is an non-load/store instruction in the loop, we can't promote
+    // it.  If there isn't a guaranteed-to-execute instruction, we can't
+    // promote.
+    if (InvalidInst || !GuaranteedToExecute)
+      continue;
+    
+    const Type *Ty = cast<PointerType>(V->getType())->getElementType();
+    AllocaInst *AI = new AllocaInst(Ty, 0, V->getName()+".tmp", FnStart);
+    PromotedValues.push_back(std::make_pair(AI, V));
 
-        for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I)
-          ValueToAllocaMap.insert(std::make_pair(I->first, AI));
+    // Update the AST and alias analysis.
+    CurAST->copyValue(V, AI);
 
-        DOUT << "LICM: Promoting value: " << *V << "\n";
-      }
-    }
+    for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I)
+      ValueToAllocaMap.insert(std::make_pair(I->first, AI));
+
+    DOUT << "LICM: Promoting value: " << *V << "\n";
   }
 }
 

Added: llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopDeletion.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopDeletion.cpp?rev=53163&view=auto

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopDeletion.cpp (added)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopDeletion.cpp Sun Jul  6 15:45:41 2008
@@ -0,0 +1,280 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-delete"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+namespace {
+  class VISIBILITY_HIDDEN LoopDeletion : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopDeletion() : LoopPass((intptr_t)&ID) { }
+    
+    // Possibly eliminate loop L if it is dead.
+    bool runOnLoop(Loop* L, LPPassManager& LPM);
+    
+    bool SingleDominatingExit(Loop* L,
+                              SmallVector<BasicBlock*, 4>& exitingBlocks);
+    bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks,
+                    SmallVector<BasicBlock*, 4>& exitBlocks);
+    bool IsLoopInvariantInst(Instruction *I, Loop* L);
+    
+    virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<LoopInfo>();
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreservedID(LCSSAID);
+    }
+  };
+}
+  
+char LoopDeletion::ID = 0;
+static RegisterPass<LoopDeletion> X("loop-deletion", "Delete dead loops");
+
+LoopPass* llvm::createLoopDeletionPass() {
+  return new LoopDeletion();
+}
+
+/// SingleDominatingExit - Checks that there is only a single blocks that 
+/// branches out of the loop, and that it also g the latch block.  Loops
+/// with multiple or non-latch-dominating exiting blocks could be dead, but we'd
+/// have to do more extensive analysis to make sure, for instance, that the 
+/// control flow logic involved was or could be made loop-invariant.
+bool LoopDeletion::SingleDominatingExit(Loop* L,
+                                   SmallVector<BasicBlock*, 4>& exitingBlocks) {
+  
+  if (exitingBlocks.size() != 1)
+    return false;
+  
+  BasicBlock* latch = L->getLoopLatch();
+  if (!latch)
+    return false;
+  
+  DominatorTree& DT = getAnalysis<DominatorTree>();
+  return DT.dominates(exitingBlocks[0], latch);
+}
+
+/// IsLoopInvariantInst - Checks if an instruction is invariant with respect to
+/// a loop, which is defined as being true if all of its operands are defined
+/// outside of the loop.  These instructions can be hoisted out of the loop
+/// if their results are needed.  This could be made more aggressive by
+/// recursively checking the operands for invariance, but it's not clear that
+/// it's worth it.
+bool LoopDeletion::IsLoopInvariantInst(Instruction *I, Loop* L)  {
+  // PHI nodes are not loop invariant if defined in  the loop.
+  if (isa<PHINode>(I) && L->contains(I->getParent()))
+    return false;
+    
+  // The instruction is loop invariant if all of its operands are loop-invariant
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (!L->isLoopInvariant(I->getOperand(i)))
+      return false;
+  
+  // If we got this far, the instruction is loop invariant!
+  return true;
+}
+
+/// IsLoopDead - Determined if a loop is dead.  This assumes that we've already
+/// checked for unique exit and exiting blocks, and that the code is in LCSSA
+/// form.
+bool LoopDeletion::IsLoopDead(Loop* L,
+                              SmallVector<BasicBlock*, 4>& exitingBlocks,
+                              SmallVector<BasicBlock*, 4>& exitBlocks) {
+  BasicBlock* exitingBlock = exitingBlocks[0];
+  BasicBlock* exitBlock = exitBlocks[0];
+  
+  // Make sure that all PHI entries coming from the loop are loop invariant.
+  // Because the code is in LCSSA form, any values used outside of the loop
+  // must pass through a PHI in the exit block, meaning that this check is
+  // sufficient to guarantee that no loop-variant values are used outside
+  // of the loop.
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode* P = dyn_cast<PHINode>(BI)) {
+    Value* incoming = P->getIncomingValueForBlock(exitingBlock);
+    if (Instruction* I = dyn_cast<Instruction>(incoming))
+      if (!IsLoopInvariantInst(I, L))
+        return false;
+      
+    BI++;
+  }
+  
+  // Make sure that no instructions in the block have potential side-effects.
+  // This includes instructions that could write to memory, and loads that are
+  // marked volatile.  This could be made more aggressive by using aliasing
+  // information to identify readonly and readnone calls.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+         BI != BE; ++BI) {
+      if (BI->mayWriteToMemory())
+        return false;
+      else if (LoadInst* L = dyn_cast<LoadInst>(BI))
+        if (L->isVolatile())
+          return false;
+    }
+  }
+  
+  return true;
+}
+
+/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the
+/// observable behavior of the program other than finite running time.  Note 
+/// we do ensure that this never remove a loop that might be infinite, as doing
+/// so could change the halting/non-halting nature of a program.
+/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
+/// in order to make various safety checks work.
+bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
+  // We can only remove the loop if there is a preheader that we can 
+  // branch from after removing it.
+  BasicBlock* preheader = L->getLoopPreheader();
+  if (!preheader)
+    return false;
+  
+  // We can't remove loops that contain subloops.  If the subloops were dead,
+  // they would already have been removed in earlier executions of this pass.
+  if (L->begin() != L->end())
+    return false;
+  
+  SmallVector<BasicBlock*, 4> exitingBlocks;
+  L->getExitingBlocks(exitingBlocks);
+  
+  SmallVector<BasicBlock*, 4> exitBlocks;
+  L->getUniqueExitBlocks(exitBlocks);
+  
+  // We require that the loop only have a single exit block.  Otherwise, we'd
+  // be in the situation of needing to be able to solve statically which exit
+  // block will be branched to, or trying to preserve the branching logic in
+  // a loop invariant manner.
+  if (exitBlocks.size() != 1)
+    return false;
+  
+  // Loops with multiple exits or exits that don't dominate the latch
+  // are too complicated to handle correctly.
+  if (!SingleDominatingExit(L, exitingBlocks))
+    return false;
+  
+  // Finally, we have to check that the loop really is dead.
+  if (!IsLoopDead(L, exitingBlocks, exitBlocks))
+    return false;
+  
+  // Don't remove loops for which we can't solve the trip count.
+  // They could be infinite, in which case we'd be changing program behavior.
+  ScalarEvolution& SE = getAnalysis<ScalarEvolution>();
+  SCEVHandle S = SE.getIterationCount(L);
+  if (isa<SCEVCouldNotCompute>(S))
+    return false;
+  
+  // Now that we know the removal is safe, remove the loop by changing the
+  // branch from the preheader to go to the single exit block.  
+  BasicBlock* exitBlock = exitBlocks[0];
+  BasicBlock* exitingBlock = exitingBlocks[0];
+  
+  // Because we're deleting a large chunk of code at once, the sequence in which
+  // we remove things is very important to avoid invalidation issues.  Don't
+  // mess with this unless you have good reason and know what you're doing.
+  
+  // Move simple loop-invariant expressions out of the loop, since they
+  // might be needed by the exit phis.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI)
+    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+         BI != BE; ) {
+      Instruction* I = BI++;
+      if (!I->use_empty() && IsLoopInvariantInst(I, L))
+        I->moveBefore(preheader->getTerminator());
+    }
+  
+  // Connect the preheader directly to the exit block.
+  TerminatorInst* TI = preheader->getTerminator();
+  TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+
+  // Rewrite phis in the exit block to get their inputs from
+  // the preheader instead of the exiting block.
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode* P = dyn_cast<PHINode>(BI)) {
+    P->replaceUsesOfWith(exitingBlock, preheader);
+    BI++;
+  }
+  
+  // Update the dominator tree and remove the instructions and blocks that will
+  // be deleted from the reference counting scheme.
+  DominatorTree& DT = getAnalysis<DominatorTree>();
+  SmallPtrSet<DomTreeNode*, 8> ChildNodes;
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    // Move all of the block's children to be children of the preheader, which
+    // allows us to remove the domtree entry for the block.
+    ChildNodes.insert(DT[*LI]->begin(), DT[*LI]->end());
+    for (SmallPtrSet<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(),
+         DE = ChildNodes.end(); DI != DE; ++DI)
+      DT.changeImmediateDominator(*DI, DT[preheader]);
+    
+    ChildNodes.clear();
+    DT.eraseNode(*LI);
+    
+    // Remove instructions that we're deleting from ScalarEvolution.
+    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+         BI != BE; ++BI)
+      SE.deleteValueFromRecords(BI);
+    
+    SE.deleteValueFromRecords(*LI);
+    
+    // Remove the block from the reference counting scheme, so that we can
+    // delete it freely later.
+    (*LI)->dropAllReferences();
+  }
+  
+  // Erase the instructions and the blocks without having to worry
+  // about ordering because we already dropped the references.
+  // NOTE: This iteration is safe because erasing the block does not remove its
+  // entry from the loop's block list.  We do that in the next section.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI)
+    (*LI)->eraseFromParent();
+  
+  // Finally, the blocks from loopinfo.  This has to happen late because
+  // otherwise our loop iterators won't work.
+  LoopInfo& loopInfo = getAnalysis<LoopInfo>();
+  SmallPtrSet<BasicBlock*, 8> blocks;
+  blocks.insert(L->block_begin(), L->block_end());
+  for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
+       E = blocks.end(); I != E; ++I)
+    loopInfo.removeBlock(*I);
+  
+  // The last step is to inform the loop pass manager that we've
+  // eliminated this loop.
+  LPM.deleteLoopFromQueue(L);
+  
+  NumDeleted++;
+  
+  return true;
+}

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopIndexSplit.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopIndexSplit.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopIndexSplit.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopIndexSplit.cpp Sun Jul  6 15:45:41 2008
@@ -195,11 +195,12 @@
     // Induction variable's final loop exit value operand number in exit condition..
     unsigned ExitValueNum;
   };
-
-  char LoopIndexSplit::ID = 0;
-  RegisterPass<LoopIndexSplit> X ("loop-index-split", "Index Split Loops");
 }
 
+char LoopIndexSplit::ID = 0;
+static RegisterPass<LoopIndexSplit>
+X("loop-index-split", "Index Split Loops");
+
 LoopPass *llvm::createLoopIndexSplitPass() {
   return new LoopIndexSplit();
 }
@@ -580,7 +581,7 @@
                                  ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
                                  SD.SplitValue, ExitValue, "lisplit", 
                                  Terminator);
-  Instruction *NSplitCond = BinaryOperator::createAnd(C1, C2, "lisplit", 
+  Instruction *NSplitCond = BinaryOperator::CreateAnd(C1, C2, "lisplit", 
                                                       Terminator);
   SD.SplitCondition->replaceAllUsesWith(NSplitCond);
   SD.SplitCondition->eraseFromParent();
@@ -595,11 +596,27 @@
     if (isa<PHINode>(I) || I == LTerminator)
       continue;
 
-    if (I == IndVarIncrement) 
-      I->replaceAllUsesWith(ExitValue);
-    else
+    if (I == IndVarIncrement) {
+      // Replace induction variable increment if it is not used outside 
+      // the loop.
+      bool UsedOutsideLoop = false;
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); 
+           UI != E; ++UI) {
+        if (Instruction *Use = dyn_cast<Instruction>(UI)) 
+          if (!L->contains(Use->getParent())) {
+            UsedOutsideLoop = true;
+            break;
+          }
+      }
+      if (!UsedOutsideLoop) {
+        I->replaceAllUsesWith(ExitValue);
+        I->eraseFromParent();
+      }
+    }
+    else {
       I->replaceAllUsesWith(UndefValue::get(I->getType()));
-    I->eraseFromParent();
+      I->eraseFromParent();
+    }
   }
 
   LPM->deleteLoopFromQueue(L);
@@ -768,7 +785,7 @@
     //
     if (ExitCondition->getPredicate() == ICmpInst::ICMP_SLT
         || ExitCondition->getPredicate() == ICmpInst::ICMP_ULT) {
-      Value *A = BinaryOperator::createAdd(NV, ConstantInt::get(Ty, 1, Sign),
+      Value *A = BinaryOperator::CreateAdd(NV, ConstantInt::get(Ty, 1, Sign),
                                            "lsplit.add", PHTerminator);
       Value *C = new ICmpInst(Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
                               A, UB,"lsplit,c", PHTerminator);
@@ -820,7 +837,7 @@
     //
     else if (ExitCondition->getPredicate() == ICmpInst::ICMP_SLE
              || ExitCondition->getPredicate() == ICmpInst::ICMP_ULE) {
-      Value *S = BinaryOperator::createSub(NV, ConstantInt::get(Ty, 1, Sign),
+      Value *S = BinaryOperator::CreateSub(NV, ConstantInt::get(Ty, 1, Sign),
                                            "lsplit.add", PHTerminator);
       Value *C = new ICmpInst(Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
                               S, UB, "lsplit.c", PHTerminator);
@@ -856,7 +873,7 @@
     //   LOOP_BODY
     //
     {
-      Value *A = BinaryOperator::createAdd(NV, ConstantInt::get(Ty, 1, Sign),
+      Value *A = BinaryOperator::CreateAdd(NV, ConstantInt::get(Ty, 1, Sign),
                                            "lsplit.add", PHTerminator);
       Value *C = new ICmpInst(Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
                               A, StartValue, "lsplit.c", PHTerminator);
@@ -1124,6 +1141,11 @@
   BasicBlock *Succ0 = SplitTerminator->getSuccessor(0);
   BasicBlock *Succ1 = SplitTerminator->getSuccessor(1);
 
+  // If split block does not dominate the latch then this is not a diamond.
+  // Such loop may not benefit from index split.
+  if (!DT->dominates(SplitCondBlock, Latch))
+    return false;
+
   // Finally this split condition is safe only if merge point for
   // split condition branch is loop latch. This check along with previous
   // check, to ensure that exit condition is in either loop latch or header,
@@ -1207,7 +1229,7 @@
           //       A;
           // for (i = max(LB, BSV); i < UB; ++i) 
           //       B;
-          BSV = BinaryOperator::createAdd(SD.SplitValue,
+          BSV = BinaryOperator::CreateAdd(SD.SplitValue,
                                           ConstantInt::get(Ty, 1, Sign),
                                           "lsplit.add", PHTerminator);
           AEV = BSV;
@@ -1238,7 +1260,7 @@
           //       B;
           // for (i = max(LB, BSV); i < UB; ++i) 
           //       A;
-          BSV = BinaryOperator::createAdd(SD.SplitValue,
+          BSV = BinaryOperator::CreateAdd(SD.SplitValue,
                                           ConstantInt::get(Ty, 1, Sign),
                                           "lsplit.add", PHTerminator);
           AEV = BSV;
@@ -1266,7 +1288,7 @@
         //       A;
         // for (i = max(LB, BSV); i <= UB; ++i) 
         //       B;
-        AEV = BinaryOperator::createSub(SD.SplitValue,
+        AEV = BinaryOperator::CreateSub(SD.SplitValue,
                                         ConstantInt::get(Ty, 1, Sign),
                                         "lsplit.sub", PHTerminator);
         break;
@@ -1282,7 +1304,7 @@
         //       A;
         // for (i = max(LB, BSV); i <= UB; ++i) 
         //       B;
-        BSV = BinaryOperator::createAdd(SD.SplitValue,
+        BSV = BinaryOperator::CreateAdd(SD.SplitValue,
                                         ConstantInt::get(Ty, 1, Sign),
                                         "lsplit.add", PHTerminator);
         break;
@@ -1298,7 +1320,7 @@
         //      B;
         // for (i = max(LB, BSV); i <= UB; ++i)
         //      A;
-        BSV = BinaryOperator::createAdd(SD.SplitValue,
+        BSV = BinaryOperator::CreateAdd(SD.SplitValue,
                                         ConstantInt::get(Ty, 1, Sign),
                                         "lsplit.add", PHTerminator);
         break;
@@ -1315,7 +1337,7 @@
         //      B;
         // for (i = max(LB, BSV); i <= UB; ++i)
         //      A;
-        AEV = BinaryOperator::createSub(SD.SplitValue,
+        AEV = BinaryOperator::CreateSub(SD.SplitValue,
                                         ConstantInt::get(Ty, 1, Sign),
                                         "lsplit.sub", PHTerminator);
         break;
@@ -1377,7 +1399,7 @@
 
   BasicBlock *SplitCondBlock = SD.SplitCondition->getParent();
   
-  // Unable to handle triange loops at the moment.
+  // Unable to handle triangle loops at the moment.
   // In triangle loop, split condition is in header and one of the
   // the split destination is loop latch. If split condition is EQ
   // then such loops are already handle in processOneIterationLoop().

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopRotation.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopRotation.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopRotation.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopRotation.cpp Sun Jul  6 15:45:41 2008
@@ -102,10 +102,10 @@
     LPPassManager *LPM_Ptr;
     SmallVector<RenameData, MAX_HEADER_SIZE> LoopHeaderInfo;
   };
-  
-  char LoopRotate::ID = 0;
-  RegisterPass<LoopRotate> X ("loop-rotate", "Rotate Loops");
 }
+  
+char LoopRotate::ID = 0;
+static RegisterPass<LoopRotate> X("loop-rotate", "Rotate Loops");
 
 LoopPass *llvm::createLoopRotatePass() { return new LoopRotate(); }
 
@@ -164,7 +164,7 @@
 
   // Check size of original header and reject
   // loop if it is very big.
-  if (OrigHeader->getInstList().size() > MAX_HEADER_SIZE)
+  if (OrigHeader->size() > MAX_HEADER_SIZE)
     return false;
 
   // Now, this loop is suitable for rotation.
@@ -208,10 +208,10 @@
     // Create new PHI node with two incoming values for NewHeader.
     // One incoming value is from OrigLatch (through OrigHeader) and 
     // second incoming value is from original pre-header.
-    PHINode *NH = PHINode::Create(In->getType(), In->getName());
+    PHINode *NH = PHINode::Create(In->getType(), In->getName(),
+                                  NewHeader->begin());
     NH->addIncoming(PN->getIncomingValueForBlock(OrigLatch), OrigHeader);
     NH->addIncoming(NPV, OrigPreHeader);
-    NewHeader->getInstList().push_front(NH);
     
     // "In" can be replaced by NH at various places.
     LoopHeaderInfo.push_back(RenameData(In, NPV, NH));
@@ -249,14 +249,36 @@
     // create new PHINode for this instruction.
     Instruction *NewHeaderReplacement = NULL;
     if (usedOutsideOriginalHeader(In)) {
-      PHINode *PN = PHINode::Create(In->getType(), In->getName());
-      PN->addIncoming(In, OrigHeader);
-      PN->addIncoming(C, OrigPreHeader);
-      NewHeader->getInstList().push_front(PN);
-      NewHeaderReplacement = PN;
-    } 
-    
-    // "In" can be replaced by NPH or NH at various places.
+      // FIXME: remove this when we have first-class aggregates.
+      if (isa<StructType>(In->getType())) {
+        // Can't create PHI nodes for this type.  If there are any getResults
+        // not defined in this block, move them back to this block.  PHI
+        // nodes will be created for all getResults later.
+        BasicBlock::iterator InsertPoint;
+        if (InvokeInst *II = dyn_cast<InvokeInst>(In)) {
+          InsertPoint = II->getNormalDest()->getFirstNonPHI();
+        } else {
+          InsertPoint = I;  // call
+          ++InsertPoint;
+        }
+        for (Value::use_iterator UI = In->use_begin(), UE = In->use_end();
+             UI != UE; ++UI) {
+          GetResultInst *InGR = cast<GetResultInst>(UI);
+          if (InGR->getParent() != OrigHeader) {
+            // Move InGR to immediately after the call or in the normal dest of
+            // the invoke.  It will be picked up, cloned and PHI'd on the next
+            // iteration.
+            InGR->moveBefore(InsertPoint);
+          }
+        }
+      } else {
+        PHINode *PN = PHINode::Create(In->getType(), In->getName(),
+                                      NewHeader->begin());
+        PN->addIncoming(In, OrigHeader);
+        PN->addIncoming(C, OrigPreHeader);
+        NewHeaderReplacement = PN;
+      }
+    }
     LoopHeaderInfo.push_back(RenameData(In, C, NewHeaderReplacement));
   }
 
@@ -336,10 +358,10 @@
       } else {
         // Used outside Exit block. Create a new PHI node from exit block
         // to receive value from ne new header ane pre header.
-        PHINode *PN = PHINode::Create(U->getType(), U->getName());
+        PHINode *PN = PHINode::Create(U->getType(), U->getName(),
+                                      Exit->begin());
         PN->addIncoming(ILoopHeaderInfo.PreHeader, OrigPreHeader);
         PN->addIncoming(OldPhi, OrigHeader);
-        Exit->getInstList().push_front(PN);
         U->replaceUsesOfWith(OldPhi, PN);
       }
     }
@@ -447,7 +469,8 @@
   // Right now original pre-header has two successors, new header and
   // exit block. Insert new block between original pre-header and
   // new header such that loop's new pre-header has only one successor.
-  BasicBlock *NewPreHeader = BasicBlock::Create("bb.nph", OrigHeader->getParent(), 
+  BasicBlock *NewPreHeader = BasicBlock::Create("bb.nph",
+                                                OrigHeader->getParent(), 
                                                 NewHeader);
   LoopInfo &LI = LPM.getAnalysis<LoopInfo>();
   if (Loop *PL = LI.getLoopFor(OrigPreHeader))
@@ -560,14 +583,15 @@
   BasicBlock::iterator I = Exit->begin(), E = Exit->end();
   PHINode *PN = NULL;
   for (; (PN = dyn_cast<PHINode>(I)); ++I) {
-    PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName());
     unsigned N = PN->getNumIncomingValues();
     for (unsigned index = 0; index < N; ++index)
       if (PN->getIncomingBlock(index) == NExit) {
+        PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName(),
+                                         NExit->begin());
         NewPN->addIncoming(PN->getIncomingValue(index), L->getLoopLatch());
         PN->setIncomingValue(index, NewPN);
         PN->setIncomingBlock(index, NExit);
-        NExit->getInstList().push_front(NewPN);
+        break;
       }
   }
 

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopStrengthReduce.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopStrengthReduce.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopStrengthReduce.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopStrengthReduce.cpp Sun Jul  6 15:45:41 2008
@@ -31,6 +31,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/Debug.h"
@@ -136,7 +137,7 @@
 
     /// DeadInsts - Keep track of instructions we may have made dead, so that
     /// we can remove them after we are done working.
-    SmallPtrSet<Instruction*,16> DeadInsts;
+    SetVector<Instruction*> DeadInsts;
 
     /// TLI - Keep a pointer of a TargetLowering to consult for determining
     /// transformation profitability.
@@ -192,12 +193,14 @@
     void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
                                       IVUsersOfOneStride &Uses,
                                       Loop *L, bool isOnlyStride);
-    void DeleteTriviallyDeadInstructions(SmallPtrSet<Instruction*,16> &Insts);
+    void DeleteTriviallyDeadInstructions(SetVector<Instruction*> &Insts);
   };
-  char LoopStrengthReduce::ID = 0;
-  RegisterPass<LoopStrengthReduce> X("loop-reduce", "Loop Strength Reduction");
 }
 
+char LoopStrengthReduce::ID = 0;
+static RegisterPass<LoopStrengthReduce>
+X("loop-reduce", "Loop Strength Reduction");
+
 LoopPass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
   return new LoopStrengthReduce(TLI);
 }
@@ -224,10 +227,10 @@
 /// specified set are trivially dead, delete them and see if this makes any of
 /// their operands subsequently dead.
 void LoopStrengthReduce::
-DeleteTriviallyDeadInstructions(SmallPtrSet<Instruction*,16> &Insts) {
+DeleteTriviallyDeadInstructions(SetVector<Instruction*> &Insts) {
   while (!Insts.empty()) {
-    Instruction *I = *Insts.begin();
-    Insts.erase(I);
+    Instruction *I = Insts.back();
+    Insts.pop_back();
 
     if (PHINode *PN = dyn_cast<PHINode>(I)) {
       // If all incoming values to the Phi are the same, we can replace the Phi
@@ -235,8 +238,8 @@
       if (Value *PNV = PN->hasConstantValue()) {
         if (Instruction *U = dyn_cast<Instruction>(PNV))
           Insts.insert(U);
-        PN->replaceAllUsesWith(PNV);
         SE->deleteValueFromRecords(PN);
+        PN->replaceAllUsesWith(PNV);
         PN->eraseFromParent();
         Changed = true;
         continue;
@@ -244,8 +247,8 @@
     }
 
     if (isInstructionTriviallyDead(I)) {
-      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
-        if (Instruction *U = dyn_cast<Instruction>(I->getOperand(i)))
+      for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+        if (Instruction *U = dyn_cast<Instruction>(*i))
           Insts.insert(U);
       SE->deleteValueFromRecords(I);
       I->eraseFromParent();
@@ -287,24 +290,25 @@
 
   gep_type_iterator GTI = gep_type_begin(GEP);
   
-  for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+  for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+       i != e; ++i, ++GTI) {
     // If this is a use of a recurrence that we can analyze, and it comes before
     // Op does in the GEP operand list, we will handle this when we process this
     // operand.
     if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
       const StructLayout *SL = TD->getStructLayout(STy);
-      unsigned Idx = cast<ConstantInt>(GEP->getOperand(i))->getZExtValue();
+      unsigned Idx = cast<ConstantInt>(*i)->getZExtValue();
       uint64_t Offset = SL->getElementOffset(Idx);
       GEPVal = SE->getAddExpr(GEPVal,
                              SE->getIntegerSCEV(Offset, UIntPtrTy));
     } else {
       unsigned GEPOpiBits = 
-        GEP->getOperand(i)->getType()->getPrimitiveSizeInBits();
+        (*i)->getType()->getPrimitiveSizeInBits();
       unsigned IntPtrBits = UIntPtrTy->getPrimitiveSizeInBits();
       Instruction::CastOps opcode = (GEPOpiBits < IntPtrBits ? 
           Instruction::SExt : (GEPOpiBits > IntPtrBits ? Instruction::Trunc :
             Instruction::BitCast));
-      Value *OpVal = getCastedVersionOf(opcode, GEP->getOperand(i));
+      Value *OpVal = getCastedVersionOf(opcode, *i);
       SCEVHandle Idx = SE->getSCEV(OpVal);
 
       uint64_t TypeSize = TD->getABITypeSize(GTI.getIndexedType());
@@ -375,7 +379,7 @@
 /// should use the post-inc value).
 static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV,
                                        Loop *L, DominatorTree *DT, Pass *P,
-                                       SmallPtrSet<Instruction*,16> &DeadInsts){
+                                       SetVector<Instruction*> &DeadInsts){
   // If the user is in the loop, use the preinc value.
   if (L->contains(User->getParent())) return false;
   
@@ -541,8 +545,9 @@
     // operands of Inst to use the new expression 'NewBase', with 'Imm' added
     // to it.
     void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+                                        Instruction *InsertPt,
                                        SCEVExpander &Rewriter, Loop *L, Pass *P,
-                                       SmallPtrSet<Instruction*,16> &DeadInsts);
+                                       SetVector<Instruction*> &DeadInsts);
     
     Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, 
                                        SCEVExpander &Rewriter,
@@ -581,9 +586,8 @@
   }
   
   // If there is no immediate value, skip the next part.
-  if (SCEVConstant *SC = dyn_cast<SCEVConstant>(Imm))
-    if (SC->getValue()->isZero())
-      return Rewriter.expandCodeFor(NewBase, BaseInsertPt);
+  if (Imm->isZero())
+    return Rewriter.expandCodeFor(NewBase, BaseInsertPt);
 
   Value *Base = Rewriter.expandCodeFor(NewBase, BaseInsertPt);
 
@@ -601,10 +605,14 @@
 
 // Once we rewrite the code to insert the new IVs we want, update the
 // operands of Inst to use the new expression 'NewBase', with 'Imm' added
-// to it.
+// to it. NewBasePt is the last instruction which contributes to the
+// value of NewBase in the case that it's a diffferent instruction from
+// the PHI that NewBase is computed from, or null otherwise.
+//
 void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+                                               Instruction *NewBasePt,
                                       SCEVExpander &Rewriter, Loop *L, Pass *P,
-                                      SmallPtrSet<Instruction*,16> &DeadInsts) {
+                                      SetVector<Instruction*> &DeadInsts) {
   if (!isa<PHINode>(Inst)) {
     // By default, insert code at the user instruction.
     BasicBlock::iterator InsertPt = Inst;
@@ -618,7 +626,11 @@
     // value will be pinned to live somewhere after the original computation.
     // In this case, we have to back off.
     if (!isUseOfPostIncrementedValue) {
-      if (Instruction *OpInst = dyn_cast<Instruction>(OperandValToReplace)) { 
+      if (NewBasePt && isa<PHINode>(OperandValToReplace)) {
+        InsertPt = NewBasePt;
+        ++InsertPt;
+      } else if (Instruction *OpInst
+                 = dyn_cast<Instruction>(OperandValToReplace)) {
         InsertPt = OpInst;
         while (isa<PHINode>(InsertPt)) ++InsertPt;
       }
@@ -878,8 +890,7 @@
 
       SeparateSubExprs(SubExprs, SARE->getOperand(0), SE);
     }
-  } else if (!isa<SCEVConstant>(Expr) ||
-             !cast<SCEVConstant>(Expr)->getValue()->isZero()) {
+  } else if (!Expr->isZero()) {
     // Do not add zero.
     SubExprs.push_back(Expr);
   }
@@ -966,14 +977,6 @@
   return Result;
 }
 
-/// isZero - returns true if the scalar evolution expression is zero.
-///
-static bool isZero(const SCEVHandle &V) {
-  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V))
-    return SC->getValue()->isZero();
-  return false;
-}
-
 /// ValidStride - Check whether the given Scale is valid for all loads and 
 /// stores in UsersToProcess.
 ///
@@ -996,7 +999,7 @@
     TargetLowering::AddrMode AM;
     if (SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
       AM.BaseOffs = SC->getValue()->getSExtValue();
-    AM.HasBaseReg = HasBaseReg || !isZero(UsersToProcess[i].Base);
+    AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero();
     AM.Scale = Scale;
 
     // If load[imm+r*scale] is illegal, bail out.
@@ -1056,7 +1059,7 @@
                IE = SI->second.IVs.end(); II != IE; ++II)
           // FIXME: Only handle base == 0 for now.
           // Only reuse previous IV if it would not require a type conversion.
-          if (isZero(II->Base) &&
+          if (II->Base->isZero() &&
               !RequiresTypeConversion(II->Base->getType(), Ty)) {
             IV = *II;
             return Scale;
@@ -1109,19 +1112,14 @@
         if (II->getOperand(1) == OperandVal)
           isAddress = true;
         break;
-      case Intrinsic::x86_sse2_loadh_pd:
-      case Intrinsic::x86_sse2_loadl_pd:
-        if (II->getOperand(2) == OperandVal)
-          isAddress = true;
-        break;
     }
   }
   return isAddress;
 }
 
 // CollectIVUsers - Transform our list of users and offsets to a bit more
-// complex table. In this new vector, each 'BasedUser' contains 'Base' the base
-// of the strided accessas well as the old information from Uses. We
+// complex table. In this new vector, each 'BasedUser' contains 'Base', the base
+// of the strided accesses, as well as the old information from Uses. We
 // progressively move information from the Base field to the Imm field, until
 // we eventually have the full access expression to rewrite the use.
 SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
@@ -1225,7 +1223,7 @@
   // their value in a register and add it in for each use. This will take up
   // a register operand, which potentially restricts what stride values are
   // valid.
-  bool HaveCommonExprs = !isZero(CommonExprs);
+  bool HaveCommonExprs = !CommonExprs->isZero();
   
   // If all uses are addresses, check if it is possible to reuse an IV with a
   // stride that is a factor of this stride. And that the multiple is a number
@@ -1398,6 +1396,16 @@
 
       SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
 
+      // If we had to insert new instrutions for RewriteOp, we have to
+      // consider that they may not have been able to end up immediately
+      // next to RewriteOp, because non-PHI instructions may never precede
+      // PHI instructions in a block. In this case, remember where the last
+      // instruction was inserted so that if we're replacing a different
+      // PHI node, we can use the later point to expand the final
+      // RewriteExpr.
+      Instruction *NewBasePt = dyn_cast<Instruction>(RewriteOp);
+      if (RewriteOp == NewPHI) NewBasePt = 0;
+
       // Clear the SCEVExpander's expression map so that we are guaranteed
       // to have the code emitted where we expect it.
       Rewriter.clear();
@@ -1424,7 +1432,8 @@
         // Add BaseV to the PHI value if needed.
         RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(BaseV));
 
-      User.RewriteInstructionToUseNewBase(RewriteExpr, Rewriter, L, this,
+      User.RewriteInstructionToUseNewBase(RewriteExpr, NewBasePt,
+                                          Rewriter, L, this,
                                           DeadInsts);
 
       // Mark old value we replaced as possibly dead, so that it is elminated
@@ -1577,8 +1586,8 @@
         ? UIntPtrTy->getPrimitiveSizeInBits()
         : NewCmpTy->getPrimitiveSizeInBits();
       if (RequiresTypeConversion(NewCmpTy, CmpTy)) {
-        // Check if it is possible to rewrite it using a iv / stride of a smaller
-        // integer type.
+        // Check if it is possible to rewrite it using
+        // an iv / stride of a smaller integer type.
         bool TruncOk = false;
         if (NewCmpTy->isInteger()) {
           unsigned Bits = NewTyBits;
@@ -1610,7 +1619,7 @@
       // Avoid rewriting the compare instruction with an iv of new stride
       // if it's likely the new stride uses will be rewritten using the
       if (AllUsesAreAddresses &&
-          ValidStride(!isZero(CommonExprs), Scale, UsersToProcess)) {        
+          ValidStride(!CommonExprs->isZero(), Scale, UsersToProcess)) {
         NewCmpVal = CmpVal;
         continue;
       }
@@ -1625,6 +1634,18 @@
     }
   }
 
+  // Forgo this transformation if it the increment happens to be
+  // unfortunately positioned after the condition, and the condition
+  // has multiple uses which prevent it from being moved immediately
+  // before the branch. See
+  // test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-*.ll
+  // for an example of this situation.
+  if (!Cond->hasOneUse())
+    for (BasicBlock::iterator I = Cond, E = Cond->getParent()->end();
+         I != E; ++I)
+      if (I == NewIncV)
+        return Cond;
+
   if (NewCmpVal != CmpVal) {
     // Create a new compare instruction using new stride / iv.
     ICmpInst *OldCond = Cond;
@@ -1636,14 +1657,14 @@
       RHS = SCEVExpander::InsertCastOfTo(Instruction::IntToPtr, RHS, NewCmpTy);
     }
     // Insert new compare instruction.
-    Cond = new ICmpInst(Predicate, NewIncV, RHS);
-    Cond->setName(L->getHeader()->getName() + ".termcond");
-    OldCond->getParent()->getInstList().insert(OldCond, Cond);
+    Cond = new ICmpInst(Predicate, NewIncV, RHS,
+                        L->getHeader()->getName() + ".termcond",
+                        OldCond);
 
     // Remove the old compare instruction. The old indvar is probably dead too.
     DeadInsts.insert(cast<Instruction>(CondUse->OperandValToReplace));
-    OldCond->replaceAllUsesWith(Cond);
     SE->deleteValueFromRecords(OldCond);
+    OldCond->replaceAllUsesWith(Cond);
     OldCond->eraseFromParent();
 
     IVUsesByStride[*CondStride].Users.pop_back();
@@ -1761,7 +1782,7 @@
 #endif
 
   // IVsByStride keeps IVs for one particular loop.
-  IVsByStride.clear();
+  assert(IVsByStride.empty() && "Stale entries in IVsByStride?");
 
   // Sort the StrideOrder so we process larger strides first.
   std::stable_sort(StrideOrder.begin(), StrideOrder.end(), StrideCompare());
@@ -1778,36 +1799,39 @@
     StrengthReduceStridedIVUsers(SI->first, SI->second, L, HasOneStride);
   }
 
+  // We're done analyzing this loop; release all the state we built up for it.
+  CastedPointers.clear();
+  IVUsesByStride.clear();
+  IVsByStride.clear();
+  StrideOrder.clear();
+
   // Clean up after ourselves
   if (!DeadInsts.empty()) {
     DeleteTriviallyDeadInstructions(DeadInsts);
 
     BasicBlock::iterator I = L->getHeader()->begin();
-    PHINode *PN;
-    while ((PN = dyn_cast<PHINode>(I))) {
-      ++I;  // Preincrement iterator to avoid invalidating it when deleting PN.
-
-      // At this point, we know that we have killed one or more GEP
-      // instructions.  It is worth checking to see if the cann indvar is also
-      // dead, so that we can remove it as well.  The requirements for the cann
-      // indvar to be considered dead are:
-      // 1. the cann indvar has one use
-      // 2. the use is an add instruction
-      // 3. the add has one use
-      // 4. the add is used by the cann indvar
-      // If all four cases above are true, then we can remove both the add and
-      // the cann indvar.
+    while (PHINode *PN = dyn_cast<PHINode>(I++)) {
+      // At this point, we know that we have killed one or more IV users.
+      // It is worth checking to see if the cann indvar is also
+      // dead, so that we can remove it as well.
+      //
+      // We can remove a PHI if it is on a cycle in the def-use graph
+      // where each node in the cycle has degree one, i.e. only one use,
+      // and is an instruction with no side effects.
+      //
       // FIXME: this needs to eliminate an induction variable even if it's being
       // compared against some value to decide loop termination.
       if (PN->hasOneUse()) {
-        Instruction *BO = dyn_cast<Instruction>(*PN->use_begin());
-        if (BO && (isa<BinaryOperator>(BO) || isa<CmpInst>(BO))) {
-          if (BO->hasOneUse() && PN == *(BO->use_begin())) {
-            DeadInsts.insert(BO);
-            // Break the cycle, then delete the PHI.
-            PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+        for (Instruction *J = dyn_cast<Instruction>(*PN->use_begin());
+             J && J->hasOneUse() && !J->mayWriteToMemory();
+             J = dyn_cast<Instruction>(*J->use_begin())) {
+          // If we find the original PHI, we've discovered a cycle.
+          if (J == PN) {
+            // Break the cycle and mark the PHI for deletion.
             SE->deleteValueFromRecords(PN);
-            PN->eraseFromParent();
+            PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+            DeadInsts.insert(PN);
+            break;
           }
         }
       }
@@ -1815,8 +1839,5 @@
     DeleteTriviallyDeadInstructions(DeadInsts);
   }
 
-  CastedPointers.clear();
-  IVUsesByStride.clear();
-  StrideOrder.clear();
   return false;
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopUnroll.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopUnroll.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopUnroll.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopUnroll.cpp Sun Jul  6 15:45:41 2008
@@ -10,53 +10,31 @@
 // This pass implements a simple loop unroller.  It works best when loops have
 // been canonicalized by the -indvars pass, allowing it to determine the trip
 // counts of loops easily.
-//
-// This pass will multi-block loops only if they contain no non-unrolled 
-// subloops.  The process of unrolling can produce extraneous basic blocks 
-// linked with unconditional branches.  This will be corrected in the future.
-//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "loop-unroll"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/IntrinsicInst.h"
-#include <algorithm>
+#include "llvm/Transforms/Utils/UnrollLoop.h"
 #include <climits>
-#include <cstdio>
+
 using namespace llvm;
 
-STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
-STATISTIC(NumUnrolled,    "Number of loops unrolled (completely or otherwise)");
+static cl::opt<unsigned>
+UnrollThreshold("unroll-threshold", cl::init(100), cl::Hidden,
+  cl::desc("The cut-off point for automatic loop unrolling"));
+
+static cl::opt<unsigned>
+UnrollCount("unroll-count", cl::init(0), cl::Hidden,
+  cl::desc("Use this unroll count for all loops, for testing purposes"));
 
 namespace {
-  cl::opt<unsigned>
-  UnrollThreshold
-    ("unroll-threshold", cl::init(100), cl::Hidden,
-     cl::desc("The cut-off point for automatic loop unrolling"));
-
-  cl::opt<unsigned>
-  UnrollCount
-    ("unroll-count", cl::init(0), cl::Hidden,
-     cl::desc("Use this unroll count for all loops, for testing purposes"));
-
   class VISIBILITY_HIDDEN LoopUnroll : public LoopPass {
-    LoopInfo *LI;  // The current loop information
   public:
     static char ID; // Pass ID, replacement for typeid
     LoopUnroll() : LoopPass((intptr_t)&ID) {}
@@ -67,8 +45,6 @@
     static const unsigned NoThreshold = UINT_MAX;
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
-    bool unrollLoop(Loop *L, unsigned Count, unsigned Threshold);
-    BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB);
 
     /// This transformation requires natural loop information & requires that
     /// loop preheaders be inserted into the CFG...
@@ -79,19 +55,27 @@
       AU.addRequired<LoopInfo>();
       AU.addPreservedID(LCSSAID);
       AU.addPreserved<LoopInfo>();
+      // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
+      // If loop unroll does not preserve dom info then LCSSA pass on next
+      // loop will receive invalid dom info.
+      // For now, recreate dom info, if loop is unrolled.
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominanceFrontier>();
     }
   };
-  char LoopUnroll::ID = 0;
-  RegisterPass<LoopUnroll> X("loop-unroll", "Unroll loops");
 }
 
+char LoopUnroll::ID = 0;
+static RegisterPass<LoopUnroll> X("loop-unroll", "Unroll loops");
+
 LoopPass *llvm::createLoopUnrollPass() { return new LoopUnroll(); }
 
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L) {
   unsigned Size = 0;
-  for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) {
-    BasicBlock *BB = L->getBlocks()[i];
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
     Instruction *Term = BB->getTerminator();
     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
       if (isa<PHINode>(I) && BB == L->getHeader()) {
@@ -122,136 +106,18 @@
   return Size;
 }
 
-// RemapInstruction - Convert the instruction operands from referencing the
-// current values into those specified by ValueMap.
-//
-static inline void RemapInstruction(Instruction *I,
-                                    DenseMap<const Value *, Value*> &ValueMap) {
-  for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
-    Value *Op = I->getOperand(op);
-    DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op);
-    if (It != ValueMap.end()) Op = It->second;
-    I->setOperand(op, Op);
-  }
-}
-
-// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it
-// only has one predecessor, and that predecessor only has one successor.
-// Returns the new combined block.
-BasicBlock *LoopUnroll::FoldBlockIntoPredecessor(BasicBlock *BB) {
-  // Merge basic blocks into their predecessor if there is only one distinct
-  // pred, and if there is only one distinct successor of the predecessor, and
-  // if there are no PHI nodes.
-  //
-  BasicBlock *OnlyPred = BB->getSinglePredecessor();
-  if (!OnlyPred) return 0;
-
-  if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
-    return 0;
-
-  DOUT << "Merging: " << *BB << "into: " << *OnlyPred;
-
-  // Resolve any PHI nodes at the start of the block.  They are all
-  // guaranteed to have exactly one entry if they exist, unless there are
-  // multiple duplicate (but guaranteed to be equal) entries for the
-  // incoming edges.  This occurs when there are multiple edges from
-  // OnlyPred to OnlySucc.
-  //
-  while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
-    PN->replaceAllUsesWith(PN->getIncomingValue(0));
-    BB->getInstList().pop_front();  // Delete the phi node...
-  }
-
-  // Delete the unconditional branch from the predecessor...
-  OnlyPred->getInstList().pop_back();
-
-  // Move all definitions in the successor to the predecessor...
-  OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
-
-  // Make all PHI nodes that referred to BB now refer to Pred as their
-  // source...
-  BB->replaceAllUsesWith(OnlyPred);
-
-  std::string OldName = BB->getName();
-
-  // Erase basic block from the function...
-  LI->removeBlock(BB);
-  BB->eraseFromParent();
-
-  // Inherit predecessor's name if it exists...
-  if (!OldName.empty() && !OnlyPred->hasName())
-    OnlyPred->setName(OldName);
-
-  return OnlyPred;
-}
-
 bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
-  LI = &getAnalysis<LoopInfo>();
-
-  // Unroll the loop.
-  if (!unrollLoop(L, UnrollCount, UnrollThreshold))
-    return false;
-
-  // Update the loop information for this loop.
-  // If we completely unrolled the loop, remove it from the parent.
-  if (L->getNumBackEdges() == 0)
-    LPM.deleteLoopFromQueue(L);
-
-  return true;
-}
-
-/// Unroll the given loop by UnrollCount, or by a heuristically-determined
-/// value if Count is zero. If Threshold is not NoThreshold, it is a value
-/// to limit code size expansion. If the loop size would expand beyond the
-/// threshold value, unrolling is suppressed. The return value is true if
-/// any transformations are performed.
-///
-bool LoopUnroll::unrollLoop(Loop *L, unsigned Count, unsigned Threshold) {
   assert(L->isLCSSAForm());
+  LoopInfo *LI = &getAnalysis<LoopInfo>();
 
   BasicBlock *Header = L->getHeader();
-  BasicBlock *LatchBlock = L->getLoopLatch();
-  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
-
   DOUT << "Loop Unroll: F[" << Header->getParent()->getName()
        << "] Loop %" << Header->getName() << "\n";
 
-  if (!BI || BI->isUnconditional()) {
-    // The loop-rotate pass can be helpful to avoid this in many cases.
-    DOUT << "  Can't unroll; loop not terminated by a conditional branch.\n";
-    return false;
-  }
-
-  // Determine the trip count and/or trip multiple. A TripCount value of zero
-  // is used to mean an unknown trip count. The TripMultiple value is the
-  // greatest known integer multiple of the trip count.
-  unsigned TripCount = 0;
-  unsigned TripMultiple = 1;
-  if (Value *TripCountValue = L->getTripCount()) {
-    if (ConstantInt *TripCountC = dyn_cast<ConstantInt>(TripCountValue)) {
-      // Guard against huge trip counts. This also guards against assertions in
-      // APInt from the use of getZExtValue, below.
-      if (TripCountC->getValue().getActiveBits() <= 32) {
-        TripCount = (unsigned)TripCountC->getZExtValue();
-      }
-    } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TripCountValue)) {
-      switch (BO->getOpcode()) {
-      case BinaryOperator::Mul:
-        if (ConstantInt *MultipleC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
-          if (MultipleC->getValue().getActiveBits() <= 32) {
-            TripMultiple = (unsigned)MultipleC->getZExtValue();
-          }
-        }
-        break;
-      default: break;
-      }
-    }
-  }
-  if (TripCount != 0)
-    DOUT << "  Trip Count = " << TripCount << "\n";
-  if (TripMultiple != 1)
-    DOUT << "  Trip Multiple = " << TripMultiple << "\n";
-
+  // Find trip count
+  unsigned TripCount = L->getSmallConstantTripCount();
+  unsigned Count = UnrollCount;
+ 
   // Automatically select an unroll count.
   if (Count == 0) {
     // Conservative heuristic: if we know the trip count, see if we can
@@ -264,249 +130,30 @@
     }
   }
 
-  // Effectively "DCE" unrolled iterations that are beyond the tripcount
-  // and will never be executed.
-  if (TripCount != 0 && Count > TripCount)
-    Count = TripCount;
-
-  assert(Count > 0);
-  assert(TripMultiple > 0);
-  assert(TripCount == 0 || TripCount % TripMultiple == 0);
-
   // Enforce the threshold.
-  if (Threshold != NoThreshold) {
+  if (UnrollThreshold != NoThreshold) {
     unsigned LoopSize = ApproximateLoopSize(L);
     DOUT << "  Loop Size = " << LoopSize << "\n";
     uint64_t Size = (uint64_t)LoopSize*Count;
-    if (TripCount != 1 && Size > Threshold) {
+    if (TripCount != 1 && Size > UnrollThreshold) {
       DOUT << "  TOO LARGE TO UNROLL: "
-           << Size << ">" << Threshold << "\n";
+           << Size << ">" << UnrollThreshold << "\n";
       return false;
     }
   }
 
-  // Are we eliminating the loop control altogether?
-  bool CompletelyUnroll = Count == TripCount;
-
-  // If we know the trip count, we know the multiple...
-  unsigned BreakoutTrip = 0;
-  if (TripCount != 0) {
-    BreakoutTrip = TripCount % Count;
-    TripMultiple = 0;
-  } else {
-    // Figure out what multiple to use.
-    BreakoutTrip = TripMultiple =
-      (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
-  }
-
-  if (CompletelyUnroll) {
-    DOUT << "COMPLETELY UNROLLING loop %" << Header->getName()
-         << " with trip count " << TripCount << "!\n";
-  } else {
-    DOUT << "UNROLLING loop %" << Header->getName()
-         << " by " << Count;
-    if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
-      DOUT << " with a breakout at trip " << BreakoutTrip;
-    } else if (TripMultiple != 1) {
-      DOUT << " with " << TripMultiple << " trips per branch";
-    }
-    DOUT << "!\n";
-  }
-
-  std::vector<BasicBlock*> LoopBlocks = L->getBlocks();
-
-  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
-  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
-
-  // For the first iteration of the loop, we should use the precloned values for
-  // PHI nodes.  Insert associations now.
-  typedef DenseMap<const Value*, Value*> ValueMapTy;
-  ValueMapTy LastValueMap;
-  std::vector<PHINode*> OrigPHINode;
-  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
-    PHINode *PN = cast<PHINode>(I);
-    OrigPHINode.push_back(PN);
-    if (Instruction *I = 
-                dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock)))
-      if (L->contains(I->getParent()))
-        LastValueMap[I] = I;
-  }
-
-  std::vector<BasicBlock*> Headers;
-  std::vector<BasicBlock*> Latches;
-  Headers.push_back(Header);
-  Latches.push_back(LatchBlock);
-
-  for (unsigned It = 1; It != Count; ++It) {
-    char SuffixBuffer[100];
-    sprintf(SuffixBuffer, ".%d", It);
-    
-    std::vector<BasicBlock*> NewBlocks;
-    
-    for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(),
-         E = LoopBlocks.end(); BB != E; ++BB) {
-      ValueMapTy ValueMap;
-      BasicBlock *New = CloneBasicBlock(*BB, ValueMap, SuffixBuffer);
-      Header->getParent()->getBasicBlockList().push_back(New);
-
-      // Loop over all of the PHI nodes in the block, changing them to use the
-      // incoming values from the previous block.
-      if (*BB == Header)
-        for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
-          PHINode *NewPHI = cast<PHINode>(ValueMap[OrigPHINode[i]]);
-          Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
-          if (Instruction *InValI = dyn_cast<Instruction>(InVal))
-            if (It > 1 && L->contains(InValI->getParent()))
-              InVal = LastValueMap[InValI];
-          ValueMap[OrigPHINode[i]] = InVal;
-          New->getInstList().erase(NewPHI);
-        }
-
-      // Update our running map of newest clones
-      LastValueMap[*BB] = New;
-      for (ValueMapTy::iterator VI = ValueMap.begin(), VE = ValueMap.end();
-           VI != VE; ++VI)
-        LastValueMap[VI->first] = VI->second;
-
-      L->addBasicBlockToLoop(New, LI->getBase());
-
-      // Add phi entries for newly created values to all exit blocks except
-      // the successor of the latch block.  The successor of the exit block will
-      // be updated specially after unrolling all the way.
-      if (*BB != LatchBlock)
-        for (Value::use_iterator UI = (*BB)->use_begin(), UE = (*BB)->use_end();
-             UI != UE;) {
-          Instruction *UseInst = cast<Instruction>(*UI);
-          ++UI;
-          if (isa<PHINode>(UseInst) && !L->contains(UseInst->getParent())) {
-            PHINode *phi = cast<PHINode>(UseInst);
-            Value *Incoming = phi->getIncomingValueForBlock(*BB);
-            phi->addIncoming(Incoming, New);
-          }
-        }
-
-      // Keep track of new headers and latches as we create them, so that
-      // we can insert the proper branches later.
-      if (*BB == Header)
-        Headers.push_back(New);
-      if (*BB == LatchBlock) {
-        Latches.push_back(New);
-
-        // Also, clear out the new latch's back edge so that it doesn't look
-        // like a new loop, so that it's amenable to being merged with adjacent
-        // blocks later on.
-        TerminatorInst *Term = New->getTerminator();
-        assert(L->contains(Term->getSuccessor(!ContinueOnTrue)));
-        assert(Term->getSuccessor(ContinueOnTrue) == LoopExit);
-        Term->setSuccessor(!ContinueOnTrue, NULL);
-      }
-
-      NewBlocks.push_back(New);
-    }
-    
-    // Remap all instructions in the most recent iteration
-    for (unsigned i = 0; i < NewBlocks.size(); ++i) {
-      BasicBlock *NB = NewBlocks[i];
-      if (BasicBlock *UnwindDest = NB->getUnwindDest())
-        NB->setUnwindDest(cast<BasicBlock>(LastValueMap[UnwindDest]));
-
-      for (BasicBlock::iterator I = NB->begin(), E = NB->end(); I != E; ++I)
-        RemapInstruction(I, LastValueMap);
-    }
-  }
-  
-  // The latch block exits the loop.  If there are any PHI nodes in the
-  // successor blocks, update them to use the appropriate values computed as the
-  // last iteration of the loop.
-  if (Count != 1) {
-    SmallPtrSet<PHINode*, 8> Users;
-    for (Value::use_iterator UI = LatchBlock->use_begin(),
-         UE = LatchBlock->use_end(); UI != UE; ++UI)
-      if (PHINode *phi = dyn_cast<PHINode>(*UI))
-        Users.insert(phi);
-    
-    BasicBlock *LastIterationBB = cast<BasicBlock>(LastValueMap[LatchBlock]);
-    for (SmallPtrSet<PHINode*,8>::iterator SI = Users.begin(), SE = Users.end();
-         SI != SE; ++SI) {
-      PHINode *PN = *SI;
-      Value *InVal = PN->removeIncomingValue(LatchBlock, false);
-      // If this value was defined in the loop, take the value defined by the
-      // last iteration of the loop.
-      if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
-        if (L->contains(InValI->getParent()))
-          InVal = LastValueMap[InVal];
-      }
-      PN->addIncoming(InVal, LastIterationBB);
-    }
-  }
-
-  // Now, if we're doing complete unrolling, loop over the PHI nodes in the
-  // original block, setting them to their incoming values.
-  if (CompletelyUnroll) {
-    BasicBlock *Preheader = L->getLoopPreheader();
-    for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
-      PHINode *PN = OrigPHINode[i];
-      PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
-      Header->getInstList().erase(PN);
-    }
-  }
-
-  // Now that all the basic blocks for the unrolled iterations are in place,
-  // set up the branches to connect them.
-  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
-    // The original branch was replicated in each unrolled iteration.
-    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
-
-    // The branch destination.
-    unsigned j = (i + 1) % e;
-    BasicBlock *Dest = Headers[j];
-    bool NeedConditional = true;
-
-    // For a complete unroll, make the last iteration end with a branch
-    // to the exit block.
-    if (CompletelyUnroll && j == 0) {
-      Dest = LoopExit;
-      NeedConditional = false;
-    }
-
-    // If we know the trip count or a multiple of it, we can safely use an
-    // unconditional branch for some iterations.
-    if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
-      NeedConditional = false;
-    }
+  // Unroll the loop.
+  Function *F = L->getHeader()->getParent();
+  if (!UnrollLoop(L, Count, LI, &LPM))
+    return false;
 
-    if (NeedConditional) {
-      // Update the conditional branch's successor for the following
-      // iteration.
-      Term->setSuccessor(!ContinueOnTrue, Dest);
-    } else {
-      Term->setUnconditionalDest(Dest);
-      // Merge adjacent basic blocks, if possible.
-      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest)) {
-        std::replace(Latches.begin(), Latches.end(), Dest, Fold);
-        std::replace(Headers.begin(), Headers.end(), Dest, Fold);
-      }
-    }
+  // FIXME: Reconstruct dom info, because it is not preserved properly.
+  DominatorTree *DT = getAnalysisToUpdate<DominatorTree>();
+  if (DT) {
+    DT->runOnFunction(*F);
+    DominanceFrontier *DF = getAnalysisToUpdate<DominanceFrontier>();
+    if (DF)
+      DF->runOnFunction(*F);
   }
-  
-  // At this point, the code is well formed.  We now do a quick sweep over the
-  // inserted code, doing constant propagation and dead code elimination as we
-  // go.
-  const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
-  for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
-       BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
-    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
-      Instruction *Inst = I++;
-
-      if (isInstructionTriviallyDead(Inst))
-        (*BB)->getInstList().erase(Inst);
-      else if (Constant *C = ConstantFoldInstruction(Inst)) {
-        Inst->replaceAllUsesWith(C);
-        (*BB)->getInstList().erase(Inst);
-      }
-    }
-
-  NumCompletelyUnrolled += CompletelyUnroll;
-  ++NumUnrolled;
   return true;
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopUnswitch.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopUnswitch.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopUnswitch.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/LoopUnswitch.cpp Sun Jul  6 15:45:41 2008
@@ -54,11 +54,11 @@
 STATISTIC(NumTrivial , "Number of unswitches that are trivial");
 STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
 
-namespace {
-  cl::opt<unsigned>
-  Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
-            cl::init(10), cl::Hidden);
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+          cl::init(10), cl::Hidden);
   
+namespace {
   class VISIBILITY_HIDDEN LoopUnswitch : public LoopPass {
     LoopInfo *LI;  // Loop information
     LPPassManager *LPM;
@@ -71,24 +71,28 @@
     bool OptimizeForSize;
     bool redoLoop;
 
+    Loop *currentLoop;
     DominanceFrontier *DF;
     DominatorTree *DT;
+    BasicBlock *loopHeader;
+    BasicBlock *loopPreheader;
     
-    /// LoopDF - Loop's dominance frontier. This set is a collection of 
-    /// loop exiting blocks' DF member blocks. However this does set does not
-    /// includes basic blocks that are inside loop.
-    SmallPtrSet<BasicBlock *, 8> LoopDF;
-
-    /// OrigLoopExitMap - This is used to map loop exiting block with 
-    /// corresponding loop exit block, before updating CFG.
-    DenseMap<BasicBlock *, BasicBlock *> OrigLoopExitMap;
+    // LoopBlocks contains all of the basic blocks of the loop, including the
+    // preheader of the loop, the body of the loop, and the exit blocks of the 
+    // loop, in that order.
+    std::vector<BasicBlock*> LoopBlocks;
+    // NewBlocks contained cloned copy of basic blocks from LoopBlocks.
+    std::vector<BasicBlock*> NewBlocks;
+
   public:
     static char ID; // Pass ID, replacement for typeid
     explicit LoopUnswitch(bool Os = false) : 
-      LoopPass((intptr_t)&ID), OptimizeForSize(Os), redoLoop(false) {}
+      LoopPass((intptr_t)&ID), OptimizeForSize(Os), redoLoop(false), 
+      currentLoop(NULL), DF(NULL), DT(NULL), loopHeader(NULL),
+      loopPreheader(NULL) {}
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
-    bool processLoop(Loop *L);
+    bool processCurrentLoop();
 
     /// This transformation requires natural loop information & requires that
     /// loop preheaders be inserted into the CFG...
@@ -115,18 +119,17 @@
         LoopProcessWorklist.erase(I);
     }
 
+    void initLoopData() {
+      loopHeader = currentLoop->getHeader();
+      loopPreheader = currentLoop->getLoopPreheader();
+    }
+
     /// Split all of the edges from inside the loop to their exit blocks.
     /// Update the appropriate Phi nodes as we do so.
-    void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks,
-                        SmallVector<BasicBlock *, 8> &MiddleBlocks);
+    void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks);
 
-    /// If BB's dominance frontier  has a member that is not part of loop L then
-    /// remove it. Add NewDFMember in BB's dominance frontier.
-    void ReplaceLoopExternalDFMember(Loop *L, BasicBlock *BB,
-                                     BasicBlock *NewDFMember);
-      
-    bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,Loop *L);
-    unsigned getLoopUnswitchCost(Loop *L, Value *LIC);
+    bool UnswitchIfProfitable(Value *LoopCond, Constant *Val);
+    unsigned getLoopUnswitchCost(Value *LIC);
     void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
                                   BasicBlock *ExitBlock);
     void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L);
@@ -143,10 +146,13 @@
     void RemoveBlockIfDead(BasicBlock *BB,
                            std::vector<Instruction*> &Worklist, Loop *l);
     void RemoveLoopFromHierarchy(Loop *L);
+    bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0,
+                                    BasicBlock **LoopExit = 0);
+
   };
-  char LoopUnswitch::ID = 0;
-  RegisterPass<LoopUnswitch> X("loop-unswitch", "Unswitch loops");
 }
+char LoopUnswitch::ID = 0;
+static RegisterPass<LoopUnswitch> X("loop-unswitch", "Unswitch loops");
 
 LoopPass *llvm::createLoopUnswitchPass(bool Os) { 
   return new LoopUnswitch(Os); 
@@ -183,26 +189,27 @@
   LPM = &LPM_Ref;
   DF = getAnalysisToUpdate<DominanceFrontier>();
   DT = getAnalysisToUpdate<DominatorTree>();
-
+  currentLoop = L;
   bool Changed = false;
-
   do {
+    assert(currentLoop->isLCSSAForm());
     redoLoop = false;
-    Changed |= processLoop(L);
+    Changed |= processCurrentLoop();
   } while(redoLoop);
 
   return Changed;
 }
 
-/// processLoop - Do actual work and unswitch loop if possible and profitable.
-bool LoopUnswitch::processLoop(Loop *L) {
-  assert(L->isLCSSAForm());
+/// processCurrentLoop - Do actual work and unswitch loop if possible 
+/// and profitable.
+bool LoopUnswitch::processCurrentLoop() {
   bool Changed = false;
 
   // Loop over all of the basic blocks in the loop.  If we find an interior
   // block that is branching on a loop-invariant condition, we can unswitch this
   // loop.
-  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+  for (Loop::block_iterator I = currentLoop->block_begin(), 
+         E = currentLoop->block_end();
        I != E; ++I) {
     TerminatorInst *TI = (*I)->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -211,15 +218,17 @@
       if (BI->isConditional()) {
         // See if this, or some part of it, is loop invariant.  If so, we can
         // unswitch on it if we desire.
-        Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), L, Changed);
-        if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(),
-                                             L)) {
+        Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), 
+                                               currentLoop, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond, 
+                                             ConstantInt::getTrue())) {
           ++NumBranches;
           return true;
         }
       }      
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
-      Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), L, Changed);
+      Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), 
+                                             currentLoop, Changed);
       if (LoopCond && SI->getNumCases() > 1) {
         // Find a value to unswitch on:
         // FIXME: this should chose the most expensive case!
@@ -228,7 +237,7 @@
         if (!UnswitchedVals.insert(UnswitchVal))
           continue;
 
-        if (UnswitchIfProfitable(LoopCond, UnswitchVal, L)) {
+        if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
           ++NumSwitches;
           return true;
         }
@@ -239,17 +248,15 @@
     for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); 
          BBI != E; ++BBI)
       if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
-        Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), L, Changed);
-        if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(),
-                                             L)) {
+        Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), 
+                                               currentLoop, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond, 
+                                             ConstantInt::getTrue())) {
           ++NumSelects;
           return true;
         }
       }
   }
-  
-  assert(L->isLCSSAForm());
-  
   return Changed;
 }
 
@@ -314,9 +321,9 @@
 /// exit.  Finally, this sets LoopExit to the BB that the loop exits to when
 /// Cond == Val.
 ///
-static bool IsTrivialUnswitchCondition(Loop *L, Value *Cond, Constant **Val = 0,
-                                       BasicBlock **LoopExit = 0) {
-  BasicBlock *Header = L->getHeader();
+bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
+                                       BasicBlock **LoopExit) {
+  BasicBlock *Header = currentLoop->getHeader();
   TerminatorInst *HeaderTerm = Header->getTerminator();
   
   BasicBlock *LoopExitBB = 0;
@@ -330,9 +337,11 @@
     // latch block or exit through a one exit block without having any 
     // side-effects.  If so, determine the value of Cond that causes it to do
     // this.
-    if ((LoopExitBB = isTrivialLoopExitBlock(L, BI->getSuccessor(0)))) {
+    if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
+                                             BI->getSuccessor(0)))) {
       if (Val) *Val = ConstantInt::getTrue();
-    } else if ((LoopExitBB = isTrivialLoopExitBlock(L, BI->getSuccessor(1)))) {
+    } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
+                                                    BI->getSuccessor(1)))) {
       if (Val) *Val = ConstantInt::getFalse();
     }
   } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) {
@@ -344,7 +353,8 @@
     // side-effects.  If so, determine the value of Cond that causes it to do
     // this.  Note that we can't trivially unswitch on the default case.
     for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
-      if ((LoopExitBB = isTrivialLoopExitBlock(L, SI->getSuccessor(i)))) {
+      if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
+                                               SI->getSuccessor(i)))) {
         // Okay, we found a trivial case, remember the value that is trivial.
         if (Val) *Val = SI->getCaseValue(i);
         break;
@@ -370,24 +380,25 @@
 }
 
 /// getLoopUnswitchCost - Return the cost (code size growth) that will happen if
-/// we choose to unswitch the specified loop on the specified value.
+/// we choose to unswitch current loop on the specified value.
 ///
-unsigned LoopUnswitch::getLoopUnswitchCost(Loop *L, Value *LIC) {
+unsigned LoopUnswitch::getLoopUnswitchCost(Value *LIC) {
   // If the condition is trivial, always unswitch.  There is no code growth for
   // this case.
-  if (IsTrivialUnswitchCondition(L, LIC))
+  if (IsTrivialUnswitchCondition(LIC))
     return 0;
   
   // FIXME: This is really overly conservative.  However, more liberal 
   // estimations have thus far resulted in excessive unswitching, which is bad
   // both in compile time and in code size.  This should be replaced once
   // someone figures out how a good estimation.
-  return L->getBlocks().size();
+  return currentLoop->getBlocks().size();
   
   unsigned Cost = 0;
   // FIXME: this is brain dead.  It should take into consideration code
   // shrinkage.
-  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+  for (Loop::block_iterator I = currentLoop->block_begin(), 
+         E = currentLoop->block_end();
        I != E; ++I) {
     BasicBlock *BB = *I;
     // Do not include empty blocks in the cost calculation.  This happen due to
@@ -402,12 +413,12 @@
   return Cost;
 }
 
-/// UnswitchIfProfitable - We have found that we can unswitch L when
+/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when
 /// LoopCond == Val to simplify the loop.  If we decide that this is profitable,
 /// unswitch the loop, reprocess the pieces, then return true.
-bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,Loop *L){
-  // Check to see if it would be profitable to unswitch this loop.
-  unsigned Cost = getLoopUnswitchCost(L, LoopCond);
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val){
+  // Check to see if it would be profitable to unswitch current loop.
+  unsigned Cost = getLoopUnswitchCost(LoopCond);
 
   // Do not do non-trivial unswitch while optimizing for size.
   if (Cost && OptimizeForSize)
@@ -418,21 +429,27 @@
     // resultant unswitched loops.
     //
     DOUT << "NOT unswitching loop %"
-         << L->getHeader()->getName() << ", cost too high: "
-         << L->getBlocks().size() << "\n";
+         << currentLoop->getHeader()->getName() << ", cost too high: "
+         << currentLoop->getBlocks().size() << "\n";
     return false;
   }
-  
-  // If this is a trivial condition to unswitch (which results in no code
-  // duplication), do it now.
+
+  initLoopData();
+
   Constant *CondVal;
   BasicBlock *ExitBlock;
-  if (IsTrivialUnswitchCondition(L, LoopCond, &CondVal, &ExitBlock)) {
-    UnswitchTrivialCondition(L, LoopCond, CondVal, ExitBlock);
+  if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) {
+    UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock);
   } else {
-    UnswitchNontrivialCondition(LoopCond, Val, L);
+    UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
   }
- 
+
+  // FIXME: Reconstruct dom info, because it is not preserved properly.
+  Function *F = loopHeader->getParent();
+  if (DT)
+    DT->runOnFunction(*F);
+  if (DF)
+    DF->runOnFunction(*F);
   return true;
 }
 
@@ -449,87 +466,6 @@
   }
 }
 
-// CloneDomInfo - NewBB is cloned from Orig basic block. Now clone Dominator
-// Info.
-//
-// If Orig block's immediate dominator is mapped in VM then use corresponding
-// immediate dominator from the map. Otherwise Orig block's dominator is also
-// NewBB's dominator.
-//
-// OrigPreheader is loop pre-header before this pass started
-// updating CFG. NewPrehader is loops new pre-header. However, after CFG
-// manipulation, loop L may not exist. So rely on input parameter NewPreheader.
-void CloneDomInfo(BasicBlock *NewBB, BasicBlock *Orig, 
-                  BasicBlock *NewPreheader, BasicBlock *OrigPreheader, 
-                  BasicBlock *OrigHeader,
-                  DominatorTree *DT, DominanceFrontier *DF,
-                  DenseMap<const Value*, Value*> &VM) {
-
-  // If NewBB alreay has found its place in domiantor tree then no need to do
-  // anything.
-  if (DT->getNode(NewBB))
-    return;
-
-  // If Orig does not have any immediate domiantor then its clone, NewBB, does 
-  // not need any immediate dominator.
-  DomTreeNode *OrigNode = DT->getNode(Orig);
-  if (!OrigNode)
-    return;
-  DomTreeNode *OrigIDomNode = OrigNode->getIDom();
-  if (!OrigIDomNode)
-    return;
-
-  BasicBlock *OrigIDom = NULL; 
-
-  // If Orig is original loop header then its immediate dominator is
-  // NewPreheader.
-  if (Orig == OrigHeader)
-    OrigIDom = NewPreheader;
-
-  // If Orig is new pre-header then its immediate dominator is
-  // original pre-header.
-  else if (Orig == NewPreheader)
-    OrigIDom = OrigPreheader;
-
-  // Other as DT to find Orig's immediate dominator.
-  else
-     OrigIDom = OrigIDomNode->getBlock();
-
-  // Initially use Orig's immediate dominator as NewBB's immediate dominator.
-  BasicBlock *NewIDom = OrigIDom;
-  DenseMap<const Value*, Value*>::iterator I = VM.find(OrigIDom);
-  if (I != VM.end()) {
-    NewIDom = cast<BasicBlock>(I->second);
-    
-    // If NewIDom does not have corresponding dominatore tree node then
-    // get one.
-    if (!DT->getNode(NewIDom))
-      CloneDomInfo(NewIDom, OrigIDom, NewPreheader, OrigPreheader, 
-                   OrigHeader, DT, DF, VM);
-  }
-  
-  DT->addNewBlock(NewBB, NewIDom);
-  
-  // Copy cloned dominance frontiner set
-  DominanceFrontier::DomSetType NewDFSet;
-  if (DF) {
-    DominanceFrontier::iterator DFI = DF->find(Orig);
-    if ( DFI != DF->end()) {
-      DominanceFrontier::DomSetType S = DFI->second;
-      for (DominanceFrontier::DomSetType::iterator I = S.begin(), E = S.end();
-           I != E; ++I) {
-        BasicBlock *BB = *I;
-        DenseMap<const Value*, Value*>::iterator IDM = VM.find(BB);
-        if (IDM != VM.end())
-          NewDFSet.insert(cast<BasicBlock>(IDM->second));
-        else
-          NewDFSet.insert(BB);
-      }
-    }
-    DF->addBasicBlock(NewBB, NewDFSet);
-  }
-}
-
 /// CloneLoop - Recursively clone the specified loop and all of its children,
 /// mapping the blocks with the specified map.
 static Loop *CloneLoop(Loop *L, Loop *PL, DenseMap<const Value*, Value*> &VM,
@@ -569,10 +505,8 @@
 
   // Insert the new branch.
   BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt);
-
 }
 
-
 /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable
 /// condition in it (a cond branch from its header block to its latch block,
 /// where the path through the loop that doesn't execute its body has no 
@@ -582,15 +516,14 @@
                                             Constant *Val, 
                                             BasicBlock *ExitBlock) {
   DOUT << "loop-unswitch: Trivial-Unswitch loop %"
-       << L->getHeader()->getName() << " [" << L->getBlocks().size()
+       << loopHeader->getName() << " [" << L->getBlocks().size()
        << " blocks] in Function " << L->getHeader()->getParent()->getName()
        << " on cond: " << *Val << " == " << *Cond << "\n";
   
   // First step, split the preheader, so that we know that there is a safe place
-  // to insert the conditional branch.  We will change 'OrigPH' to have a
+  // to insert the conditional branch.  We will change loopPreheader to have a
   // conditional branch on Cond.
-  BasicBlock *OrigPH = L->getLoopPreheader();
-  BasicBlock *NewPH = SplitEdge(OrigPH, L->getHeader(), this);
+  BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this);
 
   // Now that we have a place to insert the conditional branch, create a place
   // to branch to: this is the exit block out of the loop that we should
@@ -606,9 +539,9 @@
   // Okay, now we have a position to branch from and a position to branch to, 
   // insert the new conditional branch.
   EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, 
-                                 OrigPH->getTerminator());
-  LPM->deleteSimpleAnalysisValue(OrigPH->getTerminator(), L);
-  OrigPH->getTerminator()->eraseFromParent();
+                                 loopPreheader->getTerminator());
+  LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
+  loopPreheader->getTerminator()->eraseFromParent();
 
   // We need to reprocess this loop, it could be unswitched again.
   redoLoop = true;
@@ -620,94 +553,52 @@
   ++NumTrivial;
 }
 
-/// ReplaceLoopExternalDFMember -
-/// If BB's dominance frontier  has a member that is not part of loop L then 
-/// remove it. Add NewDFMember in BB's dominance frontier.
-void LoopUnswitch::ReplaceLoopExternalDFMember(Loop *L, BasicBlock *BB,
-                                               BasicBlock *NewDFMember) {
-  
-  DominanceFrontier::iterator DFI = DF->find(BB);
-  if (DFI == DF->end())
-    return;
-  
-  DominanceFrontier::DomSetType &DFSet = DFI->second;
-  for (DominanceFrontier::DomSetType::iterator DI = DFSet.begin(),
-         DE = DFSet.end(); DI != DE;) {
-    BasicBlock *B = *DI++;
-    if (L->contains(B))
-      continue;
-
-    DF->removeFromFrontier(DFI, B);
-    LoopDF.insert(B);
-  }
-
-  DF->addToFrontier(DFI, NewDFMember);
-}
-
 /// SplitExitEdges - Split all of the edges from inside the loop to their exit
 /// blocks.  Update the appropriate Phi nodes as we do so.
 void LoopUnswitch::SplitExitEdges(Loop *L, 
-                                 const SmallVector<BasicBlock *, 8> &ExitBlocks,
-                                  SmallVector<BasicBlock *, 8> &MiddleBlocks) {
+                                const SmallVector<BasicBlock *, 8> &ExitBlocks) 
+{
 
   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
     BasicBlock *ExitBlock = ExitBlocks[i];
     std::vector<BasicBlock*> Preds(pred_begin(ExitBlock), pred_end(ExitBlock));
 
     for (unsigned j = 0, e = Preds.size(); j != e; ++j) {
-      BasicBlock* MiddleBlock = SplitEdge(Preds[j], ExitBlock, this);
-      MiddleBlocks.push_back(MiddleBlock);
+      BasicBlock* NewExitBlock = SplitEdge(Preds[j], ExitBlock, this);
       BasicBlock* StartBlock = Preds[j];
       BasicBlock* EndBlock;
-      if (MiddleBlock->getSinglePredecessor() == ExitBlock) {
-        EndBlock = MiddleBlock;
-        MiddleBlock = EndBlock->getSinglePredecessor();;
+      if (NewExitBlock->getSinglePredecessor() == ExitBlock) {
+        EndBlock = NewExitBlock;
+        NewExitBlock = EndBlock->getSinglePredecessor();;
       } else {
         EndBlock = ExitBlock;
       }
       
-      OrigLoopExitMap[StartBlock] = EndBlock;
-
       std::set<PHINode*> InsertedPHIs;
       PHINode* OldLCSSA = 0;
       for (BasicBlock::iterator I = EndBlock->begin();
            (OldLCSSA = dyn_cast<PHINode>(I)); ++I) {
-        Value* OldValue = OldLCSSA->getIncomingValueForBlock(MiddleBlock);
+        Value* OldValue = OldLCSSA->getIncomingValueForBlock(NewExitBlock);
         PHINode* NewLCSSA = PHINode::Create(OldLCSSA->getType(),
                                             OldLCSSA->getName() + ".us-lcssa",
-                                            MiddleBlock->getTerminator());
+                                            NewExitBlock->getTerminator());
         NewLCSSA->addIncoming(OldValue, StartBlock);
-        OldLCSSA->setIncomingValue(OldLCSSA->getBasicBlockIndex(MiddleBlock),
+        OldLCSSA->setIncomingValue(OldLCSSA->getBasicBlockIndex(NewExitBlock),
                                    NewLCSSA);
         InsertedPHIs.insert(NewLCSSA);
       }
 
-      BasicBlock::iterator InsertPt = EndBlock->begin();
-      while (dyn_cast<PHINode>(InsertPt)) ++InsertPt;
-      for (BasicBlock::iterator I = MiddleBlock->begin();
+      BasicBlock::iterator InsertPt = EndBlock->getFirstNonPHI();
+      for (BasicBlock::iterator I = NewExitBlock->begin();
          (OldLCSSA = dyn_cast<PHINode>(I)) && InsertedPHIs.count(OldLCSSA) == 0;
          ++I) {
         PHINode *NewLCSSA = PHINode::Create(OldLCSSA->getType(),
                                             OldLCSSA->getName() + ".us-lcssa",
                                             InsertPt);
         OldLCSSA->replaceAllUsesWith(NewLCSSA);
-        NewLCSSA->addIncoming(OldLCSSA, MiddleBlock);
+        NewLCSSA->addIncoming(OldLCSSA, NewExitBlock);
       }
 
-      if (DF && DT) {
-        // StartBlock -- > MiddleBlock -- > EndBlock
-        // StartBlock is loop exiting block. EndBlock will become merge point 
-        // of two loop exits after loop unswitch.
-        
-        // If StartBlock's DF member includes a block that is not loop member 
-        // then replace that DF member with EndBlock.
-
-        // If MiddleBlock's DF member includes a block that is not loop member
-        // tnen replace that DF member with EndBlock.
-
-        ReplaceLoopExternalDFMember(L, StartBlock, EndBlock);
-        ReplaceLoopExternalDFMember(L, MiddleBlock, EndBlock);
-      }
     }    
   }
 
@@ -718,22 +609,18 @@
 /// condition outside of either loop.  Return the loops created as Out1/Out2.
 void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, 
                                                Loop *L) {
-  Function *F = L->getHeader()->getParent();
+  Function *F = loopHeader->getParent();
   DOUT << "loop-unswitch: Unswitching loop %"
-       << L->getHeader()->getName() << " [" << L->getBlocks().size()
+       << loopHeader->getName() << " [" << L->getBlocks().size()
        << " blocks] in Function " << F->getName()
        << " when '" << *Val << "' == " << *LIC << "\n";
 
-  // LoopBlocks contains all of the basic blocks of the loop, including the
-  // preheader of the loop, the body of the loop, and the exit blocks of the 
-  // loop, in that order.
-  std::vector<BasicBlock*> LoopBlocks;
+  LoopBlocks.clear();
+  NewBlocks.clear();
 
   // First step, split the preheader and exit blocks, and add these blocks to
   // the LoopBlocks list.
-  BasicBlock *OrigHeader = L->getHeader();
-  BasicBlock *OrigPreheader = L->getLoopPreheader();
-  BasicBlock *NewPreheader = SplitEdge(OrigPreheader, L->getHeader(), this);
+  BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this);
   LoopBlocks.push_back(NewPreheader);
 
   // We want the loop to come after the preheader, but before the exit blocks.
@@ -744,8 +631,7 @@
 
   // Split all of the edges from inside the loop to their exit blocks.  Update
   // the appropriate Phi nodes as we do so.
-  SmallVector<BasicBlock *,8> MiddleBlocks;
-  SplitExitEdges(L, ExitBlocks, MiddleBlocks);
+  SplitExitEdges(L, ExitBlocks);
 
   // The exit blocks may have been changed due to edge splitting, recompute.
   ExitBlocks.clear();
@@ -757,7 +643,6 @@
   // Next step, clone all of the basic blocks that make up the loop (including
   // the loop preheader and exit blocks), keeping track of the mapping between
   // the instructions and blocks.
-  std::vector<BasicBlock*> NewBlocks;
   NewBlocks.reserve(LoopBlocks.size());
   DenseMap<const Value*, Value*> ValueMap;
   for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
@@ -767,21 +652,6 @@
     LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], New, L);
   }
 
-  // OutSiders are basic block that are dominated by original header and
-  // at the same time they are not part of loop.
-  SmallPtrSet<BasicBlock *, 8> OutSiders;
-  if (DT) {
-    DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
-    for(std::vector<DomTreeNode*>::iterator DI = OrigHeaderNode->begin(), 
-          DE = OrigHeaderNode->end();  DI != DE; ++DI) {
-      BasicBlock *B = (*DI)->getBlock();
-
-      DenseMap<const Value*, Value*>::iterator VI = ValueMap.find(B);
-      if (VI == ValueMap.end()) 
-        OutSiders.insert(B);
-    }
-  }
-
   // Splice the newly inserted blocks into the function right before the
   // original preheader.
   F->getBasicBlockList().splice(LoopBlocks[0], F->getBasicBlockList(),
@@ -805,7 +675,7 @@
     assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
            "Exit block should have been split to have one successor!");
     BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
-    
+
     // If the successor of the exit block had PHI nodes, add an entry for
     // NewExit.
     PHINode *PN;
@@ -819,17 +689,13 @@
   }
 
   // Rewrite the code to refer to itself.
-  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
-    BasicBlock *NB = NewBlocks[i];
-    if (BasicBlock *UnwindDest = NB->getUnwindDest())
-      NB->setUnwindDest(cast<BasicBlock>(ValueMap[UnwindDest]));
-
-    for (BasicBlock::iterator I = NB->begin(), E = NB->end(); I != E; ++I)
+  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+    for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+           E = NewBlocks[i]->end(); I != E; ++I)
       RemapInstruction(I, ValueMap);
-  }
   
   // Rewrite the original preheader to select between versions of the loop.
-  BranchInst *OldBR = cast<BranchInst>(OrigPreheader->getTerminator());
+  BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
   assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
          "Preheader splitting did not work correctly!");
 
@@ -838,94 +704,6 @@
   LPM->deleteSimpleAnalysisValue(OldBR, L);
   OldBR->eraseFromParent();
 
-  // Update dominator info
-  if (DF && DT) {
-
-    SmallVector<BasicBlock *,4> ExitingBlocks;
-    L->getExitingBlocks(ExitingBlocks);
-
-    // Clone dominator info for all cloned basic block.
-    for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
-      BasicBlock *LBB = LoopBlocks[i];
-      BasicBlock *NBB = NewBlocks[i];
-      CloneDomInfo(NBB, LBB, NewPreheader, OrigPreheader, 
-                   OrigHeader, DT, DF, ValueMap);
-
-      //   If LBB's dominance frontier includes DFMember 
-      //      such that DFMember is also a member of LoopDF then
-      //         - Remove DFMember from LBB's dominance frontier
-      //         - Copy loop exiting blocks', that are dominated by BB,
-      //           dominance frontier member in BB's dominance frontier
-
-      DominanceFrontier::iterator LBBI = DF->find(LBB);
-      DominanceFrontier::iterator NBBI = DF->find(NBB);
-      if (LBBI == DF->end())
-        continue;
-
-      DominanceFrontier::DomSetType &LBSet = LBBI->second;
-      for (DominanceFrontier::DomSetType::iterator LI = LBSet.begin(),
-             LE = LBSet.end(); LI != LE; /* NULL */) {
-        BasicBlock *B = *LI++;
-        if (B == LBB && B == L->getHeader())
-          continue;
-        bool removeB = false;
-        if (!LoopDF.count(B))
-          continue;
-        
-        // If LBB dominates loop exits then insert loop exit block's DF
-        // into B's DF.
-        for(SmallVector<BasicBlock *, 4>::iterator 
-              LExitI = ExitingBlocks.begin(),
-              LExitE = ExitingBlocks.end(); LExitI != LExitE; ++LExitI) {
-          BasicBlock *E = *LExitI;
-          
-          if (!DT->dominates(LBB,E))
-            continue;
-          
-          DenseMap<BasicBlock *, BasicBlock *>::iterator DFBI = 
-            OrigLoopExitMap.find(E);
-          if (DFBI == OrigLoopExitMap.end()) 
-            continue;
-          
-          BasicBlock *DFB = DFBI->second;
-          DF->addToFrontier(LBBI, DFB);
-          DF->addToFrontier(NBBI, DFB);
-          removeB = true;
-        }
-        
-        // If B's replacement is inserted in DF then now is the time to remove
-        // B.
-        if (removeB) {
-          DF->removeFromFrontier(LBBI, B);
-          if (L->contains(B))
-            DF->removeFromFrontier(NBBI, cast<BasicBlock>(ValueMap[B]));
-          else
-            DF->removeFromFrontier(NBBI, B);
-        }
-      }
-
-    }
-
-    // MiddleBlocks are dominated by original pre header. SplitEdge updated
-    // MiddleBlocks' dominance frontier appropriately.
-    for (unsigned i = 0, e = MiddleBlocks.size(); i != e; ++i) {
-      BasicBlock *MBB = MiddleBlocks[i];
-      if (!MBB->getSinglePredecessor())
-        DT->changeImmediateDominator(MBB, OrigPreheader);
-    }
-
-    // All Outsiders are now dominated by original pre header.
-    for (SmallPtrSet<BasicBlock *, 8>::iterator OI = OutSiders.begin(),
-           OE = OutSiders.end(); OI != OE; ++OI) {
-      BasicBlock *OB = *OI;
-      DT->changeImmediateDominator(OB, OrigPreheader);
-    }
-
-    // New loop headers are dominated by original preheader
-    DT->changeImmediateDominator(NewBlocks[0], OrigPreheader);
-    DT->changeImmediateDominator(LoopBlocks[0], OrigPreheader);
-  }
-
   LoopProcessWorklist.push_back(NewLoop);
   redoLoop = true;
 
@@ -937,6 +715,7 @@
   // deleted.  If so, don't simplify it.
   if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop)
     RewriteLoopBodyWithConditionConstant(NewLoop, LIC, Val, true);
+
 }
 
 /// RemoveFromWorklist - Remove all instances of I from the worklist vector
@@ -994,7 +773,7 @@
       // If this is the header of a loop and the only pred is the latch, we now
       // have an unreachable loop.
       if (Loop *L = LI->getLoopFor(BB))
-        if (L->getHeader() == BB && L->contains(Pred)) {
+        if (loopHeader == BB && L->contains(Pred)) {
           // Remove the branch from the latch to the header block, this makes
           // the header dead, which will make the latch dead (because the header
           // dominates the latch).
@@ -1090,8 +869,6 @@
   RemoveLoopFromWorklist(L);
 }
 
-
-
 // RewriteLoopBodyWithConditionConstant - We know either that the value LIC has
 // the value specified by Val in the specified loop, or we know it does NOT have
 // that value.  Rewrite any uses of LIC or of properties correlated to it.
@@ -1153,18 +930,19 @@
               // trying to update it is complicated.  So instead we preserve the
               // loop structure and put the block on an dead code path.
               
+              BasicBlock *SISucc = SI->getSuccessor(i);
               BasicBlock* Old = SI->getParent();
               BasicBlock* Split = SplitBlock(Old, SI, this);
               
               Instruction* OldTerm = Old->getTerminator();
-              BranchInst::Create(Split, SI->getSuccessor(i),
+              BranchInst::Create(Split, SISucc,
                                  ConstantInt::getTrue(), OldTerm);
 
               LPM->deleteSimpleAnalysisValue(Old->getTerminator(), L);
               Old->getTerminator()->eraseFromParent();
               
               PHINode *PN;
-              for (BasicBlock::iterator II = SI->getSuccessor(i)->begin();
+              for (BasicBlock::iterator II = SISucc->begin();
                    (PN = dyn_cast<PHINode>(II)); ++II) {
                 Value *InVal = PN->removeIncomingValue(Split, false);
                 PN->addIncoming(InVal, Old);

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/MemCpyOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/MemCpyOptimizer.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/MemCpyOptimizer.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/MemCpyOptimizer.cpp Sun Jul  6 15:45:41 2008
@@ -14,23 +14,14 @@
 
 #define DEBUG_TYPE "memcpyopt"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Instructions.h"
 #include "llvm/ParameterAttributes.h"
-#include "llvm/Value.h"
-#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Target/TargetData.h"
@@ -40,13 +31,6 @@
 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
 
-namespace {
-  cl::opt<bool>
-  FormMemSet("form-memset-from-stores",
-             cl::desc("Transform straight-line stores to memsets"),
-             cl::init(true), cl::Hidden);
-}
-
 /// isBytewiseValue - If the specified value can be set by repeating the same
 /// byte in memory, return the i8 value that it is represented with.  This is
 /// true for all i8 values obviously, but is also true for i32 0, i32 -1,
@@ -332,13 +316,9 @@
     }
   
     // Helper fuctions
-    bool processInstruction(Instruction* I,
-                            SmallVectorImpl<Instruction*> &toErase);
-    bool processStore(StoreInst *SI, SmallVectorImpl<Instruction*> &toErase);
-    bool processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
-                       SmallVectorImpl<Instruction*> &toErase);
-    bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C,
-                              SmallVectorImpl<Instruction*> &toErase);
+    bool processStore(StoreInst *SI, BasicBlock::iterator& BBI);
+    bool processMemCpy(MemCpyInst* M);
+    bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C);
     bool iterateOnFunction(Function &F);
   };
   
@@ -357,8 +337,7 @@
 /// some other patterns to fold away.  In particular, this looks for stores to
 /// neighboring locations of memory.  If it sees enough consequtive ones
 /// (currently 4) it attempts to merge them together into a memcpy/memset.
-bool MemCpyOpt::processStore(StoreInst *SI, SmallVectorImpl<Instruction*> &toErase) {
-  if (!FormMemSet) return false;
+bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
   if (SI->isVolatile()) return false;
   
   // There are two cases that are interesting for this code to handle: memcpy
@@ -473,8 +452,13 @@
             cerr << *Range.TheStores[i];
           cerr << "With: " << *C); C=C;
   
+    // Don't invalidate the iterator
+    BBI = BI;
+  
     // Zap all the stores.
-    toErase.append(Range.TheStores.begin(), Range.TheStores.end());
+    for (SmallVector<StoreInst*, 16>::const_iterator SI = Range.TheStores.begin(),
+         SE = Range.TheStores.end(); SI != SE; ++SI)
+      (*SI)->eraseFromParent();
     ++NumMemSetInfer;
     MadeChange = true;
   }
@@ -486,8 +470,7 @@
 /// performCallSlotOptzn - takes a memcpy and a call that it depends on,
 /// and checks for the possibility of a call slot optimization by having
 /// the call write its result directly into the destination of the memcpy.
-bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C,
-                               SmallVectorImpl<Instruction*> &toErase) {
+bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
   // The general transformation to keep in mind is
   //
   //   call @func(..., src, ...)
@@ -571,10 +554,17 @@
     User* UI = srcUseList.back();
     srcUseList.pop_back();
 
-    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
+    if (isa<BitCastInst>(UI)) {
       for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
            I != E; ++I)
         srcUseList.push_back(*I);
+    } else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(UI)) {
+      if (G->hasAllZeroIndices())
+        for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+             I != E; ++I)
+          srcUseList.push_back(*I);
+      else
+        return false;
     } else if (UI != C && UI != cpy) {
       return false;
     }
@@ -597,14 +587,23 @@
     return false;
 
   // All the checks have passed, so do the transformation.
+  bool changedArgument = false;
   for (unsigned i = 0; i < CS.arg_size(); ++i)
-    if (CS.getArgument(i) == cpySrc) {
+    if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
       if (cpySrc->getType() != cpyDest->getType())
-        cpyDest = CastInst::createPointerCast(cpyDest, cpySrc->getType(),
+        cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
                                               cpyDest->getName(), C);
-      CS.setArgument(i, cpyDest);
+      changedArgument = true;
+      if (CS.getArgument(i)->getType() != cpyDest->getType())
+        CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, 
+                       CS.getArgument(i)->getType(), cpyDest->getName(), C));
+      else
+        CS.setArgument(i, cpyDest);
     }
 
+  if (!changedArgument)
+    return false;
+
   // Drop any cached information about the call, because we may have changed
   // its dependence information by changing its parameter.
   MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
@@ -612,7 +611,8 @@
 
   // Remove the memcpy
   MD.removeInstruction(cpy);
-  toErase.push_back(cpy);
+  cpy->eraseFromParent();
+  NumMemCpyInstr++;
 
   return true;
 }
@@ -621,8 +621,25 @@
 /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
 /// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
 ///  This allows later passes to remove the first memcpy altogether.
-bool MemCpyOpt::processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
-                        SmallVectorImpl<Instruction*> &toErase) {
+bool MemCpyOpt::processMemCpy(MemCpyInst* M) {
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+
+  // The are two possible optimizations we can do for memcpy:
+  //   a) memcpy-memcpy xform which exposes redundance for DSE
+  //   b) call-memcpy xform for return slot optimization
+  Instruction* dep = MD.getDependency(M);
+  if (dep == MemoryDependenceAnalysis::None ||
+      dep == MemoryDependenceAnalysis::NonLocal)
+    return false;
+  else if (!isa<MemCpyInst>(dep)) {
+    if (CallInst* C = dyn_cast<CallInst>(dep))
+      return performCallSlotOptzn(M, C);
+    else
+      return false;
+  }
+  
+  MemCpyInst* MDep = cast<MemCpyInst>(dep);
+  
   // We can only transforms memcpy's where the dest of one is the source of the
   // other
   if (M->getSource() != MDep->getDest())
@@ -667,41 +684,22 @@
   
   CallInst* C = CallInst::Create(MemCpyFun, args.begin(), args.end(), "", M);
   
-  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  
+  // If C and M don't interfere, then this is a valid transformation.  If they
+  // did, this would mean that the two sources overlap, which would be bad.
   if (MD.getDependency(C) == MDep) {
     MD.dropInstruction(M);
-    toErase.push_back(M);
+    M->eraseFromParent();
+    
+    NumMemCpyInstr++;
+    
     return true;
   }
   
+  // Otherwise, there was no point in doing this, so we remove the call we
+  // inserted and act like nothing happened.
   MD.removeInstruction(C);
-  toErase.push_back(C);
-  return false;
-}
-
-/// processInstruction - When calculating availability, handle an instruction
-/// by inserting it into the appropriate sets
-bool MemCpyOpt::processInstruction(Instruction *I,
-                                   SmallVectorImpl<Instruction*> &toErase) {
-  if (StoreInst *SI = dyn_cast<StoreInst>(I))
-    return processStore(SI, toErase);
-  
-  if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
-    MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
-
-    // The are two possible optimizations we can do for memcpy:
-    //   a) memcpy-memcpy xform which exposes redundance for DSE
-    //   b) call-memcpy xform for return slot optimization
-    Instruction* dep = MD.getDependency(M);
-    if (dep == MemoryDependenceAnalysis::None ||
-        dep == MemoryDependenceAnalysis::NonLocal)
-      return false;
-    if (MemCpyInst *MemCpy = dyn_cast<MemCpyInst>(dep))
-      return processMemCpy(M, MemCpy, toErase);
-    if (CallInst* C = dyn_cast<CallInst>(dep))
-      return performCallSlotOptzn(M, C, toErase);
-    return false;
-  }
+  C->eraseFromParent();
   
   return false;
 }
@@ -726,42 +724,19 @@
 // MemCpyOpt::iterateOnFunction - Executes one iteration of GVN
 bool MemCpyOpt::iterateOnFunction(Function &F) {
   bool changed_function = false;
-  
-  DominatorTree &DT = getAnalysis<DominatorTree>();   
-  
-  SmallVector<Instruction*, 8> toErase;
-
-  // Top-down walk of the dominator tree
-  for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
-         E = df_end(DT.getRootNode()); DI != E; ++DI) {
 
-    BasicBlock* BB = DI->getBlock();
+  // Walk all instruction in the function
+  for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
     for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
          BI != BE;) {
-      changed_function |= processInstruction(BI, toErase);
-      if (toErase.empty()) {
-        ++BI;
-        continue;
-      }
-      
-      // If we need some instructions deleted, do it now.
-      NumMemCpyInstr += toErase.size();
+      // Avoid invalidating the iterator
+      Instruction* I = BI++;
       
-      // Avoid iterator invalidation.
-      bool AtStart = BI == BB->begin();
-      if (!AtStart)
-        --BI;
-
-      for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
-           E = toErase.end(); I != E; ++I)
-        (*I)->eraseFromParent();
-
-      if (AtStart)
-        BI = BB->begin();
-      else
-        ++BI;
-      
-      toErase.clear();
+      if (StoreInst *SI = dyn_cast<StoreInst>(I))
+        changed_function |= processStore(SI, BI);
+      else if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
+        changed_function |= processMemCpy(M);
+      }
     }
   }
   

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/PredicateSimplifier.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/PredicateSimplifier.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/PredicateSimplifier.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/PredicateSimplifier.cpp Sun Jul  6 15:45:41 2008
@@ -1509,7 +1509,7 @@
       }
 
       // We'd like to allow makeEqual on two values to perform a simple
-      // substitution without every creating nodes in the IG whenever possible.
+      // substitution without creating nodes in the IG whenever possible.
       //
       // The first iteration through this loop operates on V2 before going
       // through the Remove list and operating on those too. If all of the
@@ -1594,6 +1594,7 @@
       if (mergeIGNode) {
         // Create N1.
         if (!n1) n1 = VN.getOrInsertVN(V1, Top);
+        IG.node(n1); // Ensure that IG.Nodes won't get resized
 
         // Migrate relationships from removed nodes to N1.
         for (SetVector<unsigned>::iterator I = Remove.begin(), E = Remove.end();
@@ -2646,12 +2647,12 @@
       }
     }
   }
-
-  char PredicateSimplifier::ID = 0;
-  RegisterPass<PredicateSimplifier> X("predsimplify",
-                                      "Predicate Simplifier");
 }
 
+char PredicateSimplifier::ID = 0;
+static RegisterPass<PredicateSimplifier>
+X("predsimplify", "Predicate Simplifier");
+
 FunctionPass *llvm::createPredicateSimplifierPass() {
   return new PredicateSimplifier();
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/Reassociate.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/Reassociate.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/Reassociate.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/Reassociate.cpp Sun Jul  6 15:45:41 2008
@@ -64,7 +64,7 @@
       << "," << Ops[i].Rank;
 }
   
-namespace {  
+namespace {
   class VISIBILITY_HIDDEN Reassociate : public FunctionPass {
     std::map<BasicBlock*, unsigned> RankMap;
     std::map<Value*, unsigned> ValueRankMap;
@@ -92,11 +92,11 @@
     
     void RemoveDeadBinaryOp(Value *V);
   };
-
-  char Reassociate::ID = 0;
-  RegisterPass<Reassociate> X("reassociate", "Reassociate expressions");
 }
 
+char Reassociate::ID = 0;
+static RegisterPass<Reassociate> X("reassociate", "Reassociate expressions");
+
 // Public interface to the Reassociate pass
 FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
 
@@ -194,7 +194,7 @@
 static Instruction *LowerNegateToMultiply(Instruction *Neg) {
   Constant *Cst = ConstantInt::getAllOnesValue(Neg->getType());
 
-  Instruction *Res = BinaryOperator::createMul(Neg->getOperand(1), Cst, "",Neg);
+  Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
   Res->takeName(Neg);
   Neg->replaceAllUsesWith(Res);
   Neg->eraseFromParent();
@@ -389,7 +389,7 @@
   // Insert a 'neg' instruction that subtracts the value from zero to get the
   // negation.
   //
-  return BinaryOperator::createNeg(V, V->getName() + ".neg", BI);
+  return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI);
 }
 
 /// ShouldBreakUpSubtract - Return true if we should break up this subtract of
@@ -427,7 +427,7 @@
   //
   Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
   Instruction *New =
-    BinaryOperator::createAdd(Sub->getOperand(0), NegVal, "", Sub);
+    BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
   New->takeName(Sub);
 
   // Everyone now refers to the add instruction.
@@ -451,7 +451,7 @@
     Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
     MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
     
-    Instruction *Mul = BinaryOperator::createMul(Shl->getOperand(0), MulCst,
+    Instruction *Mul = BinaryOperator::CreateMul(Shl->getOperand(0), MulCst,
                                                  "", Shl);
     Mul->takeName(Shl);
     Shl->replaceAllUsesWith(Mul);
@@ -485,7 +485,7 @@
   Value *V1 = Ops.back();
   Ops.pop_back();
   Value *V2 = EmitAddTreeOfValues(I, Ops);
-  return BinaryOperator::createAdd(V2, V1, "tmp", I);
+  return BinaryOperator::CreateAdd(V2, V1, "tmp", I);
 }
 
 /// RemoveFactorFromExpression - If V is an expression tree that is a 
@@ -714,7 +714,7 @@
       // this, we could otherwise run into situations where removing a factor
       // from an expression will drop a use of maxocc, and this can cause 
       // RemoveFactorFromExpression on successive values to behave differently.
-      Instruction *DummyInst = BinaryOperator::createAdd(MaxOccVal, MaxOccVal);
+      Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal);
       std::vector<Value*> NewMulOps;
       for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
         if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
@@ -729,7 +729,7 @@
 
       unsigned NumAddedValues = NewMulOps.size();
       Value *V = EmitAddTreeOfValues(I, NewMulOps);
-      Value *V2 = BinaryOperator::createMul(V, MaxOccVal, "tmp", I);
+      Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
 
       // Now that we have inserted V and its sole use, optimize it. This allows
       // us to handle cases that require multiple factoring steps, such as this:

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/Reg2Mem.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/Reg2Mem.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/Reg2Mem.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/Reg2Mem.cpp Sun Jul  6 15:45:41 2008
@@ -67,7 +67,7 @@
         while (isa<AllocaInst>(I)) ++I;
 
         CastInst *AllocaInsertionPoint =
-          CastInst::create(Instruction::BitCast,
+          CastInst::Create(Instruction::BitCast,
                            Constant::getNullValue(Type::Int32Ty), Type::Int32Ty,
                            "reg2mem alloca point", I);
 
@@ -111,14 +111,15 @@
       return false;
     }
   };
-  
-  char RegToMem::ID = 0;
-  RegisterPass<RegToMem> X("reg2mem", "Demote all values to stack slots");
 }
+  
+char RegToMem::ID = 0;
+static RegisterPass<RegToMem>
+X("reg2mem", "Demote all values to stack slots");
 
 // createDemoteRegisterToMemory - Provide an entry point to create this pass.
 //
-const PassInfo *llvm::DemoteRegisterToMemoryID = X.getPassInfo();
+const PassInfo *const llvm::DemoteRegisterToMemoryID = &X;
 FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
   return new RegToMem();
 }

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/SCCP.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/SCCP.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/SCCP.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/SCCP.cpp Sun Jul  6 15:45:41 2008
@@ -29,6 +29,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Compiler.h"
@@ -130,21 +131,6 @@
   }
 };
 
-/// LatticeValIndex - LatticeVal and associated Index. This is used
-/// to track individual operand Lattice values for multi value ret instructions.
-class VISIBILITY_HIDDEN LatticeValIndexed {
- public:
-  LatticeValIndexed(unsigned I = 0) { Index = I; }
-  LatticeVal &getLatticeVal() { return LV; }
-  unsigned getIndex() const { return Index; }
-
-  void setLatticeVal(LatticeVal &L) { LV = L; }
-  void setIndex(unsigned I) { Index = I; }
-
- private:
-  LatticeVal LV;
-  unsigned Index;
-};
 //===----------------------------------------------------------------------===//
 //
 /// SCCPSolver - This class is a general purpose solver for Sparse Conditional
@@ -167,7 +153,7 @@
 
   /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
   /// that return multiple values.
-  std::multimap<Function*, LatticeValIndexed> TrackedMultipleRetVals;
+  std::map<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;
 
   // The reason for two worklists is that overdefined is the lowest state
   // on the lattice, and moving things to overdefined as fast as possible
@@ -194,7 +180,7 @@
   /// MarkBlockExecutable - This method can be used by clients to mark all of
   /// the blocks that are known to be intrinsically live in the processed unit.
   void MarkBlockExecutable(BasicBlock *BB) {
-    DOUT << "Marking Block Executable: " << BB->getName() << "\n";
+    DOUT << "Marking Block Executable: " << BB->getNameStart() << "\n";
     BBExecutable.insert(BB);   // Basic block is executable!
     BBWorkList.push_back(BB);  // Add the block to the work list!
   }
@@ -220,11 +206,10 @@
     // Add an entry, F -> undef.
     if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
       for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
-        TrackedMultipleRetVals.insert(std::pair<Function *, LatticeValIndexed>
-                                      (F, LatticeValIndexed(i)));
-    }
-    else
-      TrackedRetVals[F];
+        TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
+                                                     LatticeVal()));
+    } else
+      TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
   }
 
   /// Solve - Solve for constants and executable blocks.
@@ -291,7 +276,6 @@
   // markOverdefined - Make a value be marked as "overdefined". If the
   // value is not already overdefined, add it to the overdefined instruction
   // work list so that the users of the instruction are updated later.
-
   inline void markOverdefined(LatticeVal &IV, Value *V) {
     if (IV.markOverdefined()) {
       DEBUG(DOUT << "markOverdefined: ";
@@ -351,8 +335,8 @@
       return;  // This edge is already known to be executable!
 
     if (BBExecutable.count(Dest)) {
-      DOUT << "Marking Edge Executable: " << Source->getName()
-           << " -> " << Dest->getName() << "\n";
+      DOUT << "Marking Edge Executable: " << Source->getNameStart()
+           << " -> " << Dest->getNameStart() << "\n";
 
       // The destination is already executable, but we just made an edge
       // feasible that wasn't before.  Revisit the PHI nodes in the block
@@ -407,6 +391,8 @@
   void visitExtractElementInst(ExtractElementInst &I);
   void visitInsertElementInst(InsertElementInst &I);
   void visitShuffleVectorInst(ShuffleVectorInst &I);
+  void visitExtractValueInst(ExtractValueInst &EVI);
+  void visitInsertValueInst(InsertValueInst &IVI);
 
   // Instructions that cannot be folded away...
   void visitStoreInst     (Instruction &I);
@@ -465,20 +451,8 @@
         (SCValue.isConstant() && !isa<ConstantInt>(SCValue.getConstant()))) {
       // All destinations are executable!
       Succs.assign(TI.getNumSuccessors(), true);
-    } else if (SCValue.isConstant()) {
-      Constant *CPV = SCValue.getConstant();
-      // Make sure to skip the "default value" which isn't a value
-      for (unsigned i = 1, E = SI->getNumSuccessors(); i != E; ++i) {
-        if (SI->getSuccessorValue(i) == CPV) {// Found the right branch...
-          Succs[i] = true;
-          return;
-        }
-      }
-
-      // Constant value not equal to any of the branches... must execute
-      // default branch then...
-      Succs[0] = true;
-    }
+    } else if (SCValue.isConstant())
+      Succs[SI->findCaseValue(cast<ConstantInt>(SCValue.getConstant()))] = true;
   } else {
     assert(0 && "SCCP: Don't know how to handle this terminator!");
   }
@@ -640,7 +614,7 @@
   if (!F->hasInternalLinkage())
     return;
 
-  if (!TrackedRetVals.empty()) {
+  if (!TrackedRetVals.empty() && I.getNumOperands() == 1) {
     DenseMap<Function*, LatticeVal>::iterator TFRVI =
       TrackedRetVals.find(F);
     if (TFRVI != TrackedRetVals.end() &&
@@ -651,15 +625,24 @@
     }
   }
   
-  // Handle function that returns multiple values.
-  std::multimap<Function*, LatticeValIndexed>::iterator It, E;
-  tie(It, E) = TrackedMultipleRetVals.equal_range(F);
-  if (It != E) {
-    for (; It != E; ++It) {
-      LatticeValIndexed &LV = It->second;
-      unsigned Idx = LV.getIndex();
-      Value *V = I.getOperand(Idx);
-      mergeInValue(LV.getLatticeVal(), V, getValueState(V));
+  // Handle functions that return multiple values.
+  if (!TrackedMultipleRetVals.empty() && I.getNumOperands() > 1) {
+    for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+      std::map<std::pair<Function*, unsigned>, LatticeVal>::iterator
+        It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+      if (It == TrackedMultipleRetVals.end()) break;
+      mergeInValue(It->second, F, getValueState(I.getOperand(i)));
+    }
+  } else if (!TrackedMultipleRetVals.empty() &&
+             I.getNumOperands() == 1 &&
+             isa<StructType>(I.getOperand(0)->getType())) {
+    for (unsigned i = 0, e = I.getOperand(0)->getType()->getNumContainedTypes();
+         i != e; ++i) {
+      std::map<std::pair<Function*, unsigned>, LatticeVal>::iterator
+        It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+      if (It == TrackedMultipleRetVals.end()) break;
+      Value *Val = FindInsertedValue(I.getOperand(0), i);
+      mergeInValue(It->second, F, getValueState(Val));
     }
   }
 }
@@ -687,28 +670,124 @@
 }
 
 void SCCPSolver::visitGetResultInst(GetResultInst &GRI) {
-  unsigned Idx = GRI.getIndex();
   Value *Aggr = GRI.getOperand(0);
-  Function *F = NULL;
+
+  // If the operand to the getresult is an undef, the result is undef.
+  if (isa<UndefValue>(Aggr))
+    return;
+  
+  Function *F;
+  if (CallInst *CI = dyn_cast<CallInst>(Aggr))
+    F = CI->getCalledFunction();
+  else
+    F = cast<InvokeInst>(Aggr)->getCalledFunction();
+
+  // TODO: If IPSCCP resolves the callee of this function, we could propagate a
+  // result back!
+  if (F == 0 || TrackedMultipleRetVals.empty()) {
+    markOverdefined(&GRI);
+    return;
+  }
+  
+  // See if we are tracking the result of the callee.
+  std::map<std::pair<Function*, unsigned>, LatticeVal>::iterator
+    It = TrackedMultipleRetVals.find(std::make_pair(F, GRI.getIndex()));
+
+  // If not tracking this function (for example, it is a declaration) just move
+  // to overdefined.
+  if (It == TrackedMultipleRetVals.end()) {
+    markOverdefined(&GRI);
+    return;
+  }
+  
+  // Otherwise, the value will be merged in here as a result of CallSite
+  // handling.
+}
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+  Value *Aggr = EVI.getAggregateOperand();
+
+  // If the operand to the extractvalue is an undef, the result is undef.
+  if (isa<UndefValue>(Aggr))
+    return;
+
+  // Currently only handle single-index extractvalues.
+  if (EVI.getNumIndices() != 1) {
+    markOverdefined(&EVI);
+    return;
+  }
+  
+  Function *F = 0;
   if (CallInst *CI = dyn_cast<CallInst>(Aggr))
     F = CI->getCalledFunction();
   else if (InvokeInst *II = dyn_cast<InvokeInst>(Aggr))
     F = II->getCalledFunction();
 
-  if (!F)
+  // TODO: If IPSCCP resolves the callee of this function, we could propagate a
+  // result back!
+  if (F == 0 || TrackedMultipleRetVals.empty()) {
+    markOverdefined(&EVI);
+    return;
+  }
+  
+  // See if we are tracking the result of the callee.
+  std::map<std::pair<Function*, unsigned>, LatticeVal>::iterator
+    It = TrackedMultipleRetVals.find(std::make_pair(F, *EVI.idx_begin()));
+
+  // If not tracking this function (for example, it is a declaration) just move
+  // to overdefined.
+  if (It == TrackedMultipleRetVals.end()) {
+    markOverdefined(&EVI);
+    return;
+  }
+  
+  // Otherwise, the value will be merged in here as a result of CallSite
+  // handling.
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+  Value *Aggr = IVI.getAggregateOperand();
+  Value *Val = IVI.getInsertedValueOperand();
+
+  // If the operands to the insertvalue are undef, the result is undef.
+  if (isa<UndefValue>(Aggr) && isa<UndefValue>(Val))
     return;
 
-  std::multimap<Function*, LatticeValIndexed>::iterator It, E;
-  tie(It, E) = TrackedMultipleRetVals.equal_range(F);
-  if (It == E) 
+  // Currently only handle single-index insertvalues.
+  if (IVI.getNumIndices() != 1) {
+    markOverdefined(&IVI);
     return;
+  }
 
-  for (; It != E; ++It) {
-    LatticeValIndexed &LIV = It->second;
-    if (LIV.getIndex() == Idx) {
-      mergeInValue(&GRI, LIV.getLatticeVal());
+  // Currently only handle insertvalue instructions that are in a single-use
+  // chain that builds up a return value.
+  for (const InsertValueInst *TmpIVI = &IVI; ; ) {
+    if (!TmpIVI->hasOneUse()) {
+      markOverdefined(&IVI);
+      return;
+    }
+    const Value *V = *TmpIVI->use_begin();
+    if (isa<ReturnInst>(V))
+      break;
+    TmpIVI = dyn_cast<InsertValueInst>(V);
+    if (!TmpIVI) {
+      markOverdefined(&IVI);
+      return;
     }
   }
+  
+  // See if we are tracking the result of the callee.
+  Function *F = IVI.getParent()->getParent();
+  std::map<std::pair<Function*, unsigned>, LatticeVal>::iterator
+    It = TrackedMultipleRetVals.find(std::make_pair(F, *IVI.idx_begin()));
+
+  // Merge in the inserted member value.
+  if (It != TrackedMultipleRetVals.end())
+    mergeInValue(It->second, F, getValueState(Val));
+
+  // Mark the aggregate result of the IVI overdefined; any tracking that we do
+  // will be done on the individual member values.
+  markOverdefined(&IVI);
 }
 
 void SCCPSolver::visitSelectInst(SelectInst &I) {
@@ -1127,79 +1206,101 @@
 
 void SCCPSolver::visitCallSite(CallSite CS) {
   Function *F = CS.getCalledFunction();
-
-  DenseMap<Function*, LatticeVal>::iterator TFRVI =TrackedRetVals.end();
-  // If we are tracking this function, we must make sure to bind arguments as
-  // appropriate.
-  bool FirstCall = false;
-  if (F && F->hasInternalLinkage()) {
-    TFRVI = TrackedRetVals.find(F);
-    if (TFRVI != TrackedRetVals.end()) 
-      FirstCall = true;
-    else {
-      std::multimap<Function*, LatticeValIndexed>::iterator It, E;
-      tie(It, E) = TrackedMultipleRetVals.equal_range(F);
-      if (It != E) 
-        FirstCall = true;
-    }
-  }
-
-  if (FirstCall) {
-    // If this is the first call to the function hit, mark its entry block
-    // executable.
-    if (!BBExecutable.count(F->begin()))
-      MarkBlockExecutable(F->begin());
+  Instruction *I = CS.getInstruction();
+  
+  // The common case is that we aren't tracking the callee, either because we
+  // are not doing interprocedural analysis or the callee is indirect, or is
+  // external.  Handle these cases first.
+  if (F == 0 || !F->hasInternalLinkage()) {
+CallOverdefined:
+    // Void return and not tracking callee, just bail.
+    if (I->getType() == Type::VoidTy) return;
     
-    CallSite::arg_iterator CAI = CS.arg_begin();
-    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
-         AI != E; ++AI, ++CAI) {
-      LatticeVal &IV = ValueState[AI];
-      if (!IV.isOverdefined())
-        mergeInValue(IV, AI, getValueState(*CAI));
+    // Otherwise, if we have a single return value case, and if the function is
+    // a declaration, maybe we can constant fold it.
+    if (!isa<StructType>(I->getType()) && F && F->isDeclaration() && 
+        canConstantFoldCallTo(F)) {
+      
+      SmallVector<Constant*, 8> Operands;
+      for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+           AI != E; ++AI) {
+        LatticeVal &State = getValueState(*AI);
+        if (State.isUndefined())
+          return;  // Operands are not resolved yet.
+        else if (State.isOverdefined()) {
+          markOverdefined(I);
+          return;
+        }
+        assert(State.isConstant() && "Unknown state!");
+        Operands.push_back(State.getConstant());
+      }
+     
+      // If we can constant fold this, mark the result of the call as a
+      // constant.
+      if (Constant *C = ConstantFoldCall(F, &Operands[0], Operands.size())) {
+        markConstant(I, C);
+        return;
+      }
     }
-  }
-  Instruction *I = CS.getInstruction();
-
-  if (!CS.doesNotThrow() && I->getParent()->getUnwindDest())
-    markEdgeExecutable(I->getParent(), I->getParent()->getUnwindDest());
-
-  if (I->getType() == Type::VoidTy) return;
-
-  LatticeVal &IV = ValueState[I];
-  if (IV.isOverdefined()) return;
-
-  // Propagate the single return value of the function to the value of the 
-  // instruction.
-  if (TFRVI != TrackedRetVals.end()) {
-    mergeInValue(IV, I, TFRVI->second);
-    return;
-  }
 
-  if (F == 0 || !F->isDeclaration() || !canConstantFoldCallTo(F)) {
-    markOverdefined(IV, I);
+    // Otherwise, we don't know anything about this call, mark it overdefined.
+    markOverdefined(I);
     return;
   }
 
-  SmallVector<Constant*, 8> Operands;
-  Operands.reserve(I->getNumOperands()-1);
-
-  for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
-       AI != E; ++AI) {
-    LatticeVal &State = getValueState(*AI);
-    if (State.isUndefined())
-      return;  // Operands are not resolved yet...
-    else if (State.isOverdefined()) {
-      markOverdefined(IV, I);
-      return;
+  // If this is a single/zero retval case, see if we're tracking the function.
+  DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+  if (TFRVI != TrackedRetVals.end()) {
+    // If so, propagate the return value of the callee into this call result.
+    mergeInValue(I, TFRVI->second);
+  } else if (isa<StructType>(I->getType())) {
+    // Check to see if we're tracking this callee, if not, handle it in the
+    // common path above.
+    std::map<std::pair<Function*, unsigned>, LatticeVal>::iterator
+      TMRVI = TrackedMultipleRetVals.find(std::make_pair(F, 0));
+    if (TMRVI == TrackedMultipleRetVals.end())
+      goto CallOverdefined;
+    
+    // If we are tracking this callee, propagate the return values of the call
+    // into this call site.  We do this by walking all the uses. Single-index
+    // ExtractValueInst uses can be tracked; anything more complicated is
+    // currently handled conservatively.
+    for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+         UI != E; ++UI) {
+      if (GetResultInst *GRI = dyn_cast<GetResultInst>(*UI)) {
+        mergeInValue(GRI, 
+                TrackedMultipleRetVals[std::make_pair(F, GRI->getIndex())]);
+        continue;
+      }
+      if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(*UI)) {
+        if (EVI->getNumIndices() == 1) {
+          mergeInValue(EVI, 
+                  TrackedMultipleRetVals[std::make_pair(F, *EVI->idx_begin())]);
+          continue;
+        }
+      }
+      // The aggregate value is used in a way not handled here. Assume nothing.
+      markOverdefined(*UI);
     }
-    assert(State.isConstant() && "Unknown state!");
-    Operands.push_back(State.getConstant());
+  } else {
+    // Otherwise we're not tracking this callee, so handle it in the
+    // common path above.
+    goto CallOverdefined;
+  }
+   
+  // Finally, if this is the first call to the function hit, mark its entry
+  // block executable.
+  if (!BBExecutable.count(F->begin()))
+    MarkBlockExecutable(F->begin());
+  
+  // Propagate information from this call site into the callee.
+  CallSite::arg_iterator CAI = CS.arg_begin();
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+       AI != E; ++AI, ++CAI) {
+    LatticeVal &IV = ValueState[AI];
+    if (!IV.isOverdefined())
+      mergeInValue(IV, AI, getValueState(*CAI));
   }
-
-  if (Constant *C = ConstantFoldCall(F, &Operands[0], Operands.size()))
-    markConstant(IV, I, C);
-  else
-    markOverdefined(IV, I);
 }
 
 
@@ -1380,6 +1481,12 @@
         else
           markOverdefined(LV, I);
         return true;
+      case Instruction::Call:
+        // If a call has an undef result, it is because it is constant foldable
+        // but one of the inputs was undef.  Just force the result to
+        // overdefined.
+        markOverdefined(LV, I);
+        return true;
       }
     }
   
@@ -1389,6 +1496,8 @@
       if (!getValueState(BI->getCondition()).isUndefined())
         continue;
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (SI->getNumSuccessors()<2)   // no cases
+        continue;
       if (!getValueState(SI->getCondition()).isUndefined())
         continue;
     } else {
@@ -1445,11 +1554,11 @@
       AU.setPreservesCFG();
     }
   };
-
-  char SCCP::ID = 0;
-  RegisterPass<SCCP> X("sccp", "Sparse Conditional Constant Propagation");
 } // end anonymous namespace
 
+char SCCP::ID = 0;
+static RegisterPass<SCCP>
+X("sccp", "Sparse Conditional Constant Propagation");
 
 // createSCCPPass - This is the public interface to this file...
 FunctionPass *llvm::createSCCPPass() {
@@ -1461,7 +1570,7 @@
 // and return true if the function was modified.
 //
 bool SCCP::runOnFunction(Function &F) {
-  DOUT << "SCCP on function '" << F.getName() << "'\n";
+  DOUT << "SCCP on function '" << F.getNameStart() << "'\n";
   SCCPSolver Solver;
 
   // Mark the first block of the function as being executable.
@@ -1514,25 +1623,28 @@
       //
       for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
         Instruction *Inst = BI++;
-        if (Inst->getType() != Type::VoidTy) {
-          LatticeVal &IV = Values[Inst];
-          if ((IV.isConstant() || IV.isUndefined()) &&
-              !isa<TerminatorInst>(Inst)) {
-            Constant *Const = IV.isConstant()
-              ? IV.getConstant() : UndefValue::get(Inst->getType());
-            DOUT << "  Constant: " << *Const << " = " << *Inst;
-
-            // Replaces all of the uses of a variable with uses of the constant.
-            Inst->replaceAllUsesWith(Const);
-
-            // Delete the instruction.
-            BB->getInstList().erase(Inst);
-
-            // Hey, we just changed something!
-            MadeChanges = true;
-            ++NumInstRemoved;
-          }
-        }
+        if (Inst->getType() == Type::VoidTy ||
+            isa<StructType>(Inst->getType()) ||
+            isa<TerminatorInst>(Inst))
+          continue;
+        
+        LatticeVal &IV = Values[Inst];
+        if (!IV.isConstant() && !IV.isUndefined())
+          continue;
+        
+        Constant *Const = IV.isConstant()
+          ? IV.getConstant() : UndefValue::get(Inst->getType());
+        DOUT << "  Constant: " << *Const << " = " << *Inst;
+
+        // Replaces all of the uses of a variable with uses of the constant.
+        Inst->replaceAllUsesWith(Const);
+        
+        // Delete the instruction.
+        Inst->eraseFromParent();
+        
+        // Hey, we just changed something!
+        MadeChanges = true;
+        ++NumInstRemoved;
       }
     }
 
@@ -1550,12 +1662,12 @@
     IPSCCP() : ModulePass((intptr_t)&ID) {}
     bool runOnModule(Module &M);
   };
-
-  char IPSCCP::ID = 0;
-  RegisterPass<IPSCCP>
-  Y("ipsccp", "Interprocedural Sparse Conditional Constant Propagation");
 } // end anonymous namespace
 
+char IPSCCP::ID = 0;
+static RegisterPass<IPSCCP>
+Y("ipsccp", "Interprocedural Sparse Conditional Constant Propagation");
+
 // createIPSCCPPass - This is the public interface to this file...
 ModulePass *llvm::createIPSCCPPass() {
   return new IPSCCP();
@@ -1688,27 +1800,30 @@
       } else {
         for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
           Instruction *Inst = BI++;
-          if (Inst->getType() != Type::VoidTy) {
-            LatticeVal &IV = Values[Inst];
-            if (IV.isConstant() ||
-                (IV.isUndefined() && !isa<TerminatorInst>(Inst))) {
-              Constant *Const = IV.isConstant()
-                ? IV.getConstant() : UndefValue::get(Inst->getType());
-              DOUT << "  Constant: " << *Const << " = " << *Inst;
-
-              // Replaces all of the uses of a variable with uses of the
-              // constant.
-              Inst->replaceAllUsesWith(Const);
-
-              // Delete the instruction.
-              if (!isa<TerminatorInst>(Inst) && !isa<CallInst>(Inst))
-                BB->getInstList().erase(Inst);
-
-              // Hey, we just changed something!
-              MadeChanges = true;
-              ++IPNumInstRemoved;
-            }
-          }
+          if (Inst->getType() == Type::VoidTy ||
+              isa<StructType>(Inst->getType()) ||
+              isa<TerminatorInst>(Inst))
+            continue;
+          
+          LatticeVal &IV = Values[Inst];
+          if (!IV.isConstant() && !IV.isUndefined())
+            continue;
+          
+          Constant *Const = IV.isConstant()
+            ? IV.getConstant() : UndefValue::get(Inst->getType());
+          DOUT << "  Constant: " << *Const << " = " << *Inst;
+
+          // Replaces all of the uses of a variable with uses of the
+          // constant.
+          Inst->replaceAllUsesWith(Const);
+          
+          // Delete the instruction.
+          if (!isa<CallInst>(Inst))
+            Inst->eraseFromParent();
+
+          // Hey, we just changed something!
+          MadeChanges = true;
+          ++IPNumInstRemoved;
         }
       }
 
@@ -1719,11 +1834,6 @@
       // If there are any PHI nodes in this successor, drop entries for BB now.
       BasicBlock *DeadBB = BlocksToErase[i];
       while (!DeadBB->use_empty()) {
-        if (BasicBlock *PredBB = dyn_cast<BasicBlock>(DeadBB->use_back())) {
-          PredBB->setUnwindDest(NULL);
-          continue;
-        }
-
         Instruction *I = cast<Instruction>(DeadBB->use_back());
         bool Folded = ConstantFoldTerminator(I->getParent());
         if (!Folded) {
@@ -1783,7 +1893,7 @@
     GlobalVariable *GV = I->first;
     assert(!I->second.isOverdefined() &&
            "Overdefined values should have been taken out of the map!");
-    DOUT << "Found that GV '" << GV->getName()<< "' is constant!\n";
+    DOUT << "Found that GV '" << GV->getNameStart() << "' is constant!\n";
     while (!GV->use_empty()) {
       StoreInst *SI = cast<StoreInst>(GV->use_back());
       SI->eraseFromParent();

Modified: llvm/branches/non-call-eh/lib/Transforms/Scalar/ScalarReplAggregates.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/ScalarReplAggregates.cpp?rev=53163&r1=53162&r2=53163&view=diff

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/ScalarReplAggregates.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/ScalarReplAggregates.cpp Sun Jul  6 15:45:41 2008
@@ -124,11 +124,11 @@
                                       unsigned Offset);
     static Instruction *isOnlyCopiedFromConstantGlobal(AllocationInst *AI);
   };
-
-  char SROA::ID = 0;
-  RegisterPass<SROA> X("scalarrepl", "Scalar Replacement of Aggregates");
 }
 
+char SROA::ID = 0;
+static RegisterPass<SROA> X("scalarrepl", "Scalar Replacement of Aggregates");
+
 // Public interface to the ScalarReplAggregates pass
 FunctionPass *llvm::createScalarReplAggregatesPass(signed int Threshold) { 
   return new SROA(Threshold);
@@ -178,6 +178,14 @@
   return Changed;
 }
 
+/// getNumSAElements - Return the number of elements in the specific struct or
+/// array.
+static uint64_t getNumSAElements(const Type *T) {
+  if (const StructType *ST = dyn_cast<StructType>(T))
+    return ST->getNumElements();
+  return cast<ArrayType>(T)->getNumElements();
+}
+
 // performScalarRepl - This algorithm is a simple worklist driven algorithm,
 // which runs on all of the malloc/alloca instructions in the function, removing
 // them if they are only used by getelementptr instructions.
@@ -224,7 +232,10 @@
         (isa<StructType>(AI->getAllocatedType()) ||
          isa<ArrayType>(AI->getAllocatedType())) &&
         AI->getAllocatedType()->isSized() &&
-        TD.getABITypeSize(AI->getAllocatedType()) < SRThreshold) {
+        // Do not promote any struct whose size is larger than "128" bytes.
+        TD.getABITypeSize(AI->getAllocatedType()) < SRThreshold &&
+        // Do not promote any struct into more than "32" separate vars.
+        getNumSAElements(AI->getAllocatedType()) < SRThreshold/4) {
       // Check that all of the users of the allocation are capable of being
       // transformed.
       switch (isSafeAllocaToScalarRepl(AI)) {
@@ -302,6 +313,43 @@
       continue;
     }
     
+    // Replace:
+    //   %res = load { i32, i32 }* %alloc
+    // with:
+    //   %load.0 = load i32* %alloc.0
+    //   %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0 
+    //   %load.1 = load i32* %alloc.1
+    //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 
+    // (Also works for arrays instead of structs)
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      Value *Insert = UndefValue::get(LI->getType());
+      for (unsigned i = 0, e = ElementAllocas.size(); i != e; ++i) {
+        Value *Load = new LoadInst(ElementAllocas[i], "load", LI);
+        Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI);
+      }
+      LI->replaceAllUsesWith(Insert);
+      LI->eraseFromParent();
+      continue;
+    }
+
+    // Replace:
+    //   store { i32, i32 } %val, { i32, i32 }* %alloc
+    // with:
+    //   %val.0 = extractvalue { i32, i32 } %val, 0 
+    //   store i32 %val.0, i32* %alloc.0
+    //   %val.1 = extractvalue { i32, i32 } %val, 1 
+    //   store i32 %val.1, i32* %alloc.1
+    // (Also works for arrays instead of structs)
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      Value *Val = SI->getOperand(0);
+      for (unsigned i = 0, e = ElementAllocas.size(); i != e; ++i) {
+        Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI);
+        new StoreInst(Extract, ElementAllocas[i], SI);
+      }
+      SI->eraseFromParent();
+      continue;
+    }
+    
     GetElementPtrInst *GEPI = cast<GetElementPtrInst>(User);
     // We now know that the GEP is of the form: GEP <ptr>, 0, <cst>
     unsigned Idx =
@@ -440,6 +488,12 @@
   if (BitCastInst *C = dyn_cast<BitCastInst>(User))
     return isSafeUseOfBitCastedAllocation(C, AI, Info);
 
+  if (isa<LoadInst>(User))
+    return; // Loads (returning a first class aggregrate) are always rewritable
+
+  if (isa<StoreInst>(User) && User->getOperand(0) != AI)
+    return; // Store is ok if storing INTO the pointer, not storing the pointer
+ 
   GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User);
   if (GEPI == 0)
     return MarkUnsafe(Info);
@@ -631,11 +685,9 @@
       // If this is a memcpy/memmove, emit a GEP of the other element address.
       Value *OtherElt = 0;
       if (OtherPtr) {
-        Value *Idx[2];
-        Idx[0] = Zero;
-        Idx[1] = ConstantInt::get(Type::Int32Ty, i);
+        Value *Idx[2] = { Zero, ConstantInt::get(Type::Int32Ty, i) };
         OtherElt = GetElementPtrInst::Create(OtherPtr, Idx, Idx + 2,
-                                             OtherPtr->getNameStr()+"."+utostr(i),
+                                           OtherPtr->getNameStr()+"."+utostr(i),
                                              MI);
       }
 
@@ -643,7 +695,7 @@
       const Type *EltTy =cast<PointerType>(EltPtr->getType())->getElementType();
       
       // If we got down to a scalar, insert a load or store as appropriate.
-      if (EltTy->isFirstClassType()) {
+      if (EltTy->isSingleValueType()) {
         if (isa<MemCpyInst>(MI) || isa<MemMoveInst>(MI)) {
           Value *Elt = new LoadInst(SROADest ? OtherElt : EltPtr, "tmp",
                                     MI);
@@ -737,8 +789,7 @@
 
 /// HasPadding - Return true if the specified type has any structure or
 /// alignment padding, false otherwise.
-static bool HasPadding(const Type *Ty, const TargetData &TD,
-                       bool inPacked = false) {
+static bool HasPadding(const Type *Ty, const TargetData &TD) {
   if (const StructType *STy = dyn_cast<StructType>(Ty)) {
     const StructLayout *SL = TD.getStructLayout(STy);
     unsigned PrevFieldBitOffset = 0;
@@ -746,7 +797,7 @@
       unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
 
       // Padding in sub-elements?
-      if (HasPadding(STy->getElementType(i), TD, STy->isPacked()))
+      if (HasPadding(STy->getElementType(i), TD))
         return true;
 
       // Check to see if there is any padding between this element and the
@@ -770,12 +821,11 @@
     }
 
   } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
-    return HasPadding(ATy->getElementType(), TD, false);
+    return HasPadding(ATy->getElementType(), TD);
   } else if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
-    return HasPadding(VTy->getElementType(), TD, false);
+    return HasPadding(VTy->getElementType(), TD);
   }
-  return inPacked ?
-    false : TD.getTypeSizeInBits(Ty) != TD.getABITypeSizeInBits(Ty);
+  return TD.getTypeSizeInBits(Ty) != TD.getABITypeSizeInBits(Ty);
 }
 
 /// isSafeStructAllocaToScalarRepl - Check to see if the specified allocation of
@@ -963,12 +1013,22 @@
     Instruction *User = cast<Instruction>(*UI);
     
     if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      // FIXME: Loads of a first class aggregrate value could be converted to a
+      // series of loads and insertvalues
+      if (!LI->getType()->isSingleValueType())
+        return 0;
+
       if (MergeInType(LI->getType(), UsedType, TD))
         return 0;
       
     } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
       // Storing the pointer, not into the value?
       if (SI->getOperand(0) == V) return 0;
+
+      // FIXME: Stores of a first class aggregrate value could be converted to a
+      // series of extractvalues and stores
+      if (!SI->getOperand(0)->getType()->isSingleValueType())
+        return 0;
       
       // NOTE: We could handle storing of FP imms into integers here!
       
@@ -1204,11 +1264,11 @@
   // We do this to support (f.e.) loads off the end of a structure where
   // only some bits are used.
   if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
-    NV = BinaryOperator::createLShr(NV, 
+    NV = BinaryOperator::CreateLShr(NV, 
                                     ConstantInt::get(NV->getType(),ShAmt),
                                     LI->getName(), LI);
   else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
-    NV = BinaryOperator::createShl(NV, 
+    NV = BinaryOperator::CreateShl(NV, 
                                    ConstantInt::get(NV->getType(),-ShAmt),
                                    LI->getName(), LI);
   
@@ -1308,12 +1368,12 @@
     // only some bits in the structure are set.
     APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
     if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
-      SV = BinaryOperator::createShl(SV, 
+      SV = BinaryOperator::CreateShl(SV, 
                                      ConstantInt::get(SV->getType(), ShAmt),
                                      SV->getName(), SI);
       Mask <<= ShAmt;
     } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
-      SV = BinaryOperator::createLShr(SV,
+      SV = BinaryOperator::CreateLShr(SV,
                                       ConstantInt::get(SV->getType(),-ShAmt),
                                       SV->getName(), SI);
       Mask = Mask.lshr(ShAmt);
@@ -1323,9 +1383,9 @@
     // in the new bits.
     if (SrcWidth != DestWidth) {
       assert(DestWidth > SrcWidth);
-      Old = BinaryOperator::createAnd(Old, ConstantInt::get(~Mask),
+      Old = BinaryOperator::CreateAnd(Old, ConstantInt::get(~Mask),
                                       Old->getName()+".mask", SI);
-      SV = BinaryOperator::createOr(Old, SV, SV->getName()+".ins", SI);
+      SV = BinaryOperator::CreateOr(Old, SV, SV->getName()+".ins", SI);
     }
   }
   return SV;

Removed: llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyCFG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyCFG.cpp?rev=53162&view=auto

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyCFG.cpp (original)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyCFG.cpp (removed)
@@ -1,230 +0,0 @@
-//===- SimplifyCFG.cpp - CFG Simplification Pass --------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements dead code elimination and basic block merging, along
-// with a collection of other peephole control flow optimizations.  For example:
-//
-//   * Removes basic blocks with no predecessors.
-//   * Merges a basic block into its predecessor if there is only one and the
-//     predecessor only has one successor.
-//   * Eliminates PHI nodes for basic blocks with a single predecessor.
-//   * Eliminates a basic block that only contains an unconditional branch.
-//   * Changes invoke instructions to nounwind functions to be calls.
-//   * Change things like "if (x) if (y)" into "if (x&y)".
-//   * etc..
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "simplifycfg"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/ParameterAttributes.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Pass.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-using namespace llvm;
-
-STATISTIC(NumSimpl, "Number of blocks simplified");
-
-namespace {
-  struct VISIBILITY_HIDDEN CFGSimplifyPass : public FunctionPass {
-    static char ID; // Pass identification, replacement for typeid
-    CFGSimplifyPass() : FunctionPass((intptr_t)&ID) {}
-
-    virtual bool runOnFunction(Function &F);
-  };
-  char CFGSimplifyPass::ID = 0;
-  RegisterPass<CFGSimplifyPass> X("simplifycfg", "Simplify the CFG");
-}
-
-// Public interface to the CFGSimplification pass
-FunctionPass *llvm::createCFGSimplificationPass() {
-  return new CFGSimplifyPass();
-}
-
-/// ChangeToUnreachable - Insert an unreachable instruction before the specified
-/// instruction, making it and the rest of the code in the block dead.
-static void ChangeToUnreachable(Instruction *I) {
-  BasicBlock *BB = I->getParent();
-  // Loop over all of the successors, removing BB's entry from any PHI
-  // nodes.
-  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-    (*SI)->removePredecessor(BB);
-  
-  new UnreachableInst(I);
-  
-  // All instructions after this are dead.
-  BasicBlock::iterator BBI = I, BBE = BB->end();
-  while (BBI != BBE) {
-    if (!BBI->use_empty())
-      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
-    BB->getInstList().erase(BBI++);
-  }
-}
-
-/// ChangeToCall - Convert the specified invoke into a normal call.
-static void ChangeToCall(InvokeInst *II) {
-  BasicBlock *BB = II->getParent();
-  SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
-  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args.begin(),
-                                       Args.end(), "", II);
-  NewCall->takeName(II);
-  NewCall->setCallingConv(II->getCallingConv());
-  NewCall->setParamAttrs(II->getParamAttrs());
-  II->replaceAllUsesWith(NewCall);
-
-  // Follow the call by a branch to the normal destination.
-  BranchInst::Create(II->getNormalDest(), II);
-
-  // Update PHI nodes in the unwind destination
-  II->getUnwindDest()->removePredecessor(BB);
-  BB->getInstList().erase(II);
-}
-
-static bool MarkAliveBlocks(BasicBlock *BB,
-                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
-  
-  SmallVector<BasicBlock*, 128> Worklist;
-  Worklist.push_back(BB);
-  bool Changed = false;
-  while (!Worklist.empty()) {
-    BB = Worklist.back();
-    Worklist.pop_back();
-    
-    if (!Reachable.insert(BB))
-      continue;
-
-    // Do a quick scan of the basic block, turning any obviously unreachable
-    // instructions into LLVM unreachable insts.  The instruction combining pass
-    // canonicalizes unreachable insts into stores to null or undef.
-    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
-      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
-        if (CI->doesNotReturn()) {
-          // If we found a call to a no-return function, insert an unreachable
-          // instruction after it.  Make sure there isn't *already* one there
-          // though.
-          ++BBI;
-          if (!isa<UnreachableInst>(BBI)) {
-            ChangeToUnreachable(BBI);
-            Changed = true;
-          }
-          break;
-        }
-      }
-      
-      if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
-        if (isa<ConstantPointerNull>(SI->getOperand(1)) ||
-            isa<UndefValue>(SI->getOperand(1))) {
-          ChangeToUnreachable(SI);
-          Changed = true;
-          break;
-        }
-    }
-
-    // Turn invokes that call 'nounwind' functions into ordinary calls.
-    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
-      if (II->doesNotThrow()) {
-        ChangeToCall(II);
-        Changed = true;
-      }
-
-    Changed |= ConstantFoldTerminator(BB);
-    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      Worklist.push_back(*SI);
-  }
-  return Changed;
-}
-
-/// RemoveUnreachableBlocks - Remove blocks that are not reachable, even if they
-/// are in a dead cycle.  Return true if a change was made, false otherwise.
-static bool RemoveUnreachableBlocks(Function &F) {
-  SmallPtrSet<BasicBlock*, 128> Reachable;
-  bool Changed = MarkAliveBlocks(F.begin(), Reachable);
-  
-  // If there are unreachable blocks in the CFG...
-  if (Reachable.size() == F.size())
-    return Changed;
-  
-  assert(Reachable.size() < F.size());
-  NumSimpl += F.size()-Reachable.size();
-  
-  // Loop over all of the basic blocks that are not reachable, dropping all of
-  // their internal references...
-  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
-    if (Reachable.count(BB))
-      continue;
-    
-    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      if (Reachable.count(*SI))
-        (*SI)->removePredecessor(BB);
-    BB->dropAllReferences();
-  }
-  
-  for (Function::iterator I = ++F.begin(); I != F.end();)
-    if (!Reachable.count(I))
-      I = F.getBasicBlockList().erase(I);
-    else
-      ++I;
-  
-  return true;
-}
-
-/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function,
-/// iterating until no more changes are made.
-static bool IterativeSimplifyCFG(Function &F) {
-  bool Changed = false;
-  bool LocalChange = true;
-  while (LocalChange) {
-    LocalChange = false;
-    
-    // Loop over all of the basic blocks (except the first one) and remove them
-    // if they are unneeded...
-    //
-    for (Function::iterator BBIt = ++F.begin(); BBIt != F.end(); ) {
-      if (SimplifyCFG(BBIt++)) {
-        LocalChange = true;
-        ++NumSimpl;
-      }
-    }
-    Changed |= LocalChange;
-  }
-  return Changed;
-}
-
-// It is possible that we may require multiple passes over the code to fully
-// simplify the CFG.
-//
-bool CFGSimplifyPass::runOnFunction(Function &F) {
-  bool EverChanged = RemoveUnreachableBlocks(F);
-  EverChanged |= IterativeSimplifyCFG(F);
-  
-  // If neither pass changed anything, we're done.
-  if (!EverChanged) return false;
-
-  // IterativeSimplifyCFG can (rarely) make some loops dead.  If this happens,
-  // RemoveUnreachableBlocks is needed to nuke them, which means we should
-  // iterate between the two optimizations.  We structure the code like this to
-  // avoid reruning IterativeSimplifyCFG if the second pass of 
-  // RemoveUnreachableBlocks doesn't do anything.
-  if (!RemoveUnreachableBlocks(F))
-    return true;
-  
-  do {
-    EverChanged = IterativeSimplifyCFG(F);
-    EverChanged |= RemoveUnreachableBlocks(F);
-  } while (EverChanged);
-  
-  return true;
-}

Added: llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyCFGPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyCFGPass.cpp?rev=53163&view=auto

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyCFGPass.cpp (added)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyCFGPass.cpp Sun Jul  6 15:45:41 2008
@@ -0,0 +1,231 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations.  For example:
+//
+//   * Removes basic blocks with no predecessors.
+//   * Merges a basic block into its predecessor if there is only one and the
+//     predecessor only has one successor.
+//   * Eliminates PHI nodes for basic blocks with a single predecessor.
+//   * Eliminates a basic block that only contains an unconditional branch.
+//   * Changes invoke instructions to nounwind functions to be calls.
+//   * Change things like "if (x) if (y)" into "if (x&y)".
+//   * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ParameterAttributes.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+namespace {
+  struct VISIBILITY_HIDDEN CFGSimplifyPass : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    CFGSimplifyPass() : FunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+  };
+}
+
+char CFGSimplifyPass::ID = 0;
+static RegisterPass<CFGSimplifyPass> X("simplifycfg", "Simplify the CFG");
+
+// Public interface to the CFGSimplification pass
+FunctionPass *llvm::createCFGSimplificationPass() {
+  return new CFGSimplifyPass();
+}
+
+/// ChangeToUnreachable - Insert an unreachable instruction before the specified
+/// instruction, making it and the rest of the code in the block dead.
+static void ChangeToUnreachable(Instruction *I) {
+  BasicBlock *BB = I->getParent();
+  // Loop over all of the successors, removing BB's entry from any PHI
+  // nodes.
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    (*SI)->removePredecessor(BB);
+  
+  new UnreachableInst(I);
+  
+  // All instructions after this are dead.
+  BasicBlock::iterator BBI = I, BBE = BB->end();
+  while (BBI != BBE) {
+    if (!BBI->use_empty())
+      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+    BB->getInstList().erase(BBI++);
+  }
+}
+
+/// ChangeToCall - Convert the specified invoke into a normal call.
+static void ChangeToCall(InvokeInst *II) {
+  BasicBlock *BB = II->getParent();
+  SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
+  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args.begin(),
+                                       Args.end(), "", II);
+  NewCall->takeName(II);
+  NewCall->setCallingConv(II->getCallingConv());
+  NewCall->setParamAttrs(II->getParamAttrs());
+  II->replaceAllUsesWith(NewCall);
+
+  // Follow the call by a branch to the normal destination.
+  BranchInst::Create(II->getNormalDest(), II);
+
+  // Update PHI nodes in the unwind destination
+  II->getUnwindDest()->removePredecessor(BB);
+  BB->getInstList().erase(II);
+}
+
+static bool MarkAliveBlocks(BasicBlock *BB,
+                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
+  
+  SmallVector<BasicBlock*, 128> Worklist;
+  Worklist.push_back(BB);
+  bool Changed = false;
+  while (!Worklist.empty()) {
+    BB = Worklist.back();
+    Worklist.pop_back();
+    
+    if (!Reachable.insert(BB))
+      continue;
+
+    // Do a quick scan of the basic block, turning any obviously unreachable
+    // instructions into LLVM unreachable insts.  The instruction combining pass
+    // canonicalizes unreachable insts into stores to null or undef.
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+        if (CI->doesNotReturn()) {
+          // If we found a call to a no-return function, insert an unreachable
+          // instruction after it.  Make sure there isn't *already* one there
+          // though.
+          ++BBI;
+          if (!isa<UnreachableInst>(BBI)) {
+            ChangeToUnreachable(BBI);
+            Changed = true;
+          }
+          break;
+        }
+      }
+      
+      if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
+        if (isa<ConstantPointerNull>(SI->getOperand(1)) ||
+            isa<UndefValue>(SI->getOperand(1))) {
+          ChangeToUnreachable(SI);
+          Changed = true;
+          break;
+        }
+    }
+
+    // Turn invokes that call 'nounwind' functions into ordinary calls.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+      if (II->doesNotThrow()) {
+        ChangeToCall(II);
+        Changed = true;
+      }
+
+    Changed |= ConstantFoldTerminator(BB);
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      Worklist.push_back(*SI);
+  }
+  return Changed;
+}
+
+/// RemoveUnreachableBlocks - Remove blocks that are not reachable, even if they
+/// are in a dead cycle.  Return true if a change was made, false otherwise.
+static bool RemoveUnreachableBlocks(Function &F) {
+  SmallPtrSet<BasicBlock*, 128> Reachable;
+  bool Changed = MarkAliveBlocks(F.begin(), Reachable);
+  
+  // If there are unreachable blocks in the CFG...
+  if (Reachable.size() == F.size())
+    return Changed;
+  
+  assert(Reachable.size() < F.size());
+  NumSimpl += F.size()-Reachable.size();
+  
+  // Loop over all of the basic blocks that are not reachable, dropping all of
+  // their internal references...
+  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+    if (Reachable.count(BB))
+      continue;
+    
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      if (Reachable.count(*SI))
+        (*SI)->removePredecessor(BB);
+    BB->dropAllReferences();
+  }
+  
+  for (Function::iterator I = ++F.begin(); I != F.end();)
+    if (!Reachable.count(I))
+      I = F.getBasicBlockList().erase(I);
+    else
+      ++I;
+  
+  return true;
+}
+
+/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool IterativeSimplifyCFG(Function &F) {
+  bool Changed = false;
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+    
+    // Loop over all of the basic blocks (except the first one) and remove them
+    // if they are unneeded...
+    //
+    for (Function::iterator BBIt = ++F.begin(); BBIt != F.end(); ) {
+      if (SimplifyCFG(BBIt++)) {
+        LocalChange = true;
+        ++NumSimpl;
+      }
+    }
+    Changed |= LocalChange;
+  }
+  return Changed;
+}
+
+// It is possible that we may require multiple passes over the code to fully
+// simplify the CFG.
+//
+bool CFGSimplifyPass::runOnFunction(Function &F) {
+  bool EverChanged = RemoveUnreachableBlocks(F);
+  EverChanged |= IterativeSimplifyCFG(F);
+  
+  // If neither pass changed anything, we're done.
+  if (!EverChanged) return false;
+
+  // IterativeSimplifyCFG can (rarely) make some loops dead.  If this happens,
+  // RemoveUnreachableBlocks is needed to nuke them, which means we should
+  // iterate between the two optimizations.  We structure the code like this to
+  // avoid reruning IterativeSimplifyCFG if the second pass of 
+  // RemoveUnreachableBlocks doesn't do anything.
+  if (!RemoveUnreachableBlocks(F))
+    return true;
+  
+  do {
+    EverChanged = IterativeSimplifyCFG(F);
+    EverChanged |= RemoveUnreachableBlocks(F);
+  } while (EverChanged);
+  
+  return true;
+}

Added: llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyLibCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyLibCalls.cpp?rev=53163&view=auto

==============================================================================
--- llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyLibCalls.cpp (added)
+++ llvm/branches/non-call-eh/lib/Transforms/Scalar/SimplifyLibCalls.cpp Sun Jul  6 15:45:41 2008
@@ -0,0 +1,1444 @@
+//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple pass that applies a variety of small
+// optimizations for calls to specific well-known function calls (e.g. runtime
+// library functions). For example, a call to the function "exit(3)" that
+// occurs within the main() function can be transformed into a simple "return 3"
+// instruction. Any optimization that takes this form (replace call to library
+// function with simpler code that provides the same result) belongs in this
+// file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplify-libcalls"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+STATISTIC(NumSimplified, "Number of library calls simplified");
+
+//===----------------------------------------------------------------------===//
+// Optimizer Base Class
+//===----------------------------------------------------------------------===//
+
+/// This class is the abstract base class for the set of optimizations that
+/// corresponds to one library call.
+namespace {
+class VISIBILITY_HIDDEN LibCallOptimization {
+protected:
+  Function *Caller;
+  const TargetData *TD;
+public:
+  LibCallOptimization() { }
+  virtual ~LibCallOptimization() {}
+
+  /// CallOptimizer - This pure virtual method is implemented by base classes to
+  /// do various optimizations.  If this returns null then no transformation was
+  /// performed.  If it returns CI, then it transformed the call and CI is to be
+  /// deleted.  If it returns something else, replace CI with the new value and
+  /// delete CI.
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) =0;
+  
+  Value *OptimizeCall(CallInst *CI, const TargetData &TD, IRBuilder &B) {
+    Caller = CI->getParent()->getParent();
+    this->TD = &TD;
+    return CallOptimizer(CI->getCalledFunction(), CI, B);
+  }
+
+  /// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
+  Value *CastToCStr(Value *V, IRBuilder &B);
+
+  /// EmitStrLen - Emit a call to the strlen function to the builder, for the
+  /// specified pointer.  Ptr is required to be some pointer type, and the
+  /// return value has 'intptr_t' type.
+  Value *EmitStrLen(Value *Ptr, IRBuilder &B);
+  
+  /// EmitMemCpy - Emit a call to the memcpy function to the builder.  This
+  /// always expects that the size has type 'intptr_t' and Dst/Src are pointers.
+  Value *EmitMemCpy(Value *Dst, Value *Src, Value *Len, 
+                    unsigned Align, IRBuilder &B);
+  
+  /// EmitMemChr - Emit a call to the memchr function.  This assumes that Ptr is
+  /// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
+  Value *EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder &B);
+    
+  /// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g.
+  /// 'floor').  This function is known to take a single of type matching 'Op'
+  /// and returns one value with the same type.  If 'Op' is a long double, 'l'
+  /// is added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
+  Value *EmitUnaryFloatFnCall(Value *Op, const char *Name, IRBuilder &B);
+  
+  /// EmitPutChar - Emit a call to the putchar function.  This assumes that Char
+  /// is an integer.
+  void EmitPutChar(Value *Char, IRBuilder &B);
+  
+  /// EmitPutS - Emit a call to the puts function.  This assumes that Str is
+  /// some pointer.
+  void EmitPutS(Value *Str, IRBuilder &B);
+    
+  /// EmitFPutC - Emit a call to the fputc function.  This assumes that Char is
+  /// an i32, and File is a pointer to FILE.
+  void EmitFPutC(Value *Char, Value *File, IRBuilder &B);
+  
+  /// EmitFPutS - Emit a call to the puts function.  Str is required to be a
+  /// pointer and File is a pointer to FILE.
+  void EmitFPutS(Value *Str, Value *File, IRBuilder &B);
+  
+  /// EmitFWrite - Emit a call to the fwrite function.  This assumes that Ptr is
+  /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
+  void EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder &B);
+    
+};
+} // End anonymous namespace.
+
+/// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
+Value *LibCallOptimization::CastToCStr(Value *V, IRBuilder &B) {
+  return B.CreateBitCast(V, PointerType::getUnqual(Type::Int8Ty), "cstr");
+}
+
+/// EmitStrLen - Emit a call to the strlen function to the builder, for the
+/// specified pointer.  This always returns an integer value of size intptr_t.
+Value *LibCallOptimization::EmitStrLen(Value *Ptr, IRBuilder &B) {
+  Module *M = Caller->getParent();
+  Constant *StrLen =M->getOrInsertFunction("strlen", TD->getIntPtrType(),
+                                           PointerType::getUnqual(Type::Int8Ty),
+                                           NULL);
+  return B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen");
+}
+
+/// EmitMemCpy - Emit a call to the memcpy function to the builder.  This always
+/// expects that the size has type 'intptr_t' and Dst/Src are pointers.
+Value *LibCallOptimization::EmitMemCpy(Value *Dst, Value *Src, Value *Len,
+                                       unsigned Align, IRBuilder &B) {
+  Module *M = Caller->getParent();
+  Intrinsic::ID IID = Len->getType() == Type::Int32Ty ?
+                           Intrinsic::memcpy_i32 : Intrinsic::memcpy_i64;
+  Value *MemCpy = Intrinsic::getDeclaration(M, IID);
+  return B.CreateCall4(MemCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Len,
+                       ConstantInt::get(Type::Int32Ty, Align));
+}
+
+/// EmitMemChr - Emit a call to the memchr function.  This assumes that Ptr is
+/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
+Value *LibCallOptimization::EmitMemChr(Value *Ptr, Value *Val,
+                                       Value *Len, IRBuilder &B) {
+  Module *M = Caller->getParent();
+  Value *MemChr = M->getOrInsertFunction("memchr",
+                                         PointerType::getUnqual(Type::Int8Ty),
+                                         PointerType::getUnqual(Type::Int8Ty),
+                                         Type::Int32Ty, TD->getIntPtrType(),
+                                         NULL);
+  return B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr");
+}
+
+/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g.
+/// 'floor').  This function is known to take a single of type matching 'Op' and
+/// returns one value with the same type.  If 'Op' is a long double, 'l' is
+/// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
+Value *LibCallOptimization::EmitUnaryFloatFnCall(Value *Op, const char *Name,
+                                                 IRBuilder &B) {
+  char NameBuffer[20];
+  if (Op->getType() != Type::DoubleTy) {
+    // If we need to add a suffix, copy into NameBuffer.
+    unsigned NameLen = strlen(Name);
+    assert(NameLen < sizeof(NameBuffer)-2);
+    memcpy(NameBuffer, Name, NameLen);
+    if (Op->getType() == Type::FloatTy)
+      NameBuffer[NameLen] = 'f';  // floorf
+    else
+      NameBuffer[NameLen] = 'l';  // floorl
+    NameBuffer[NameLen+1] = 0;
+    Name = NameBuffer;
+  }
+  
+  Module *M = Caller->getParent();
+  Value *Callee = M->getOrInsertFunction(Name, Op->getType(), 
+                                         Op->getType(), NULL);
+  return B.CreateCall(Callee, Op, Name);
+}
+
+/// EmitPutChar - Emit a call to the putchar function.  This assumes that Char
+/// is an integer.
+void LibCallOptimization::EmitPutChar(Value *Char, IRBuilder &B) {
+  Module *M = Caller->getParent();
+  Value *F = M->getOrInsertFunction("putchar", Type::Int32Ty,
+                                    Type::Int32Ty, NULL);
+  B.CreateCall(F, B.CreateIntCast(Char, Type::Int32Ty, "chari"), "putchar");
+}
+
+/// EmitPutS - Emit a call to the puts function.  This assumes that Str is
+/// some pointer.
+void LibCallOptimization::EmitPutS(Value *Str, IRBuilder &B) {
+  Module *M = Caller->getParent();
+  Value *F = M->getOrInsertFunction("puts", Type::Int32Ty,
+                                    PointerType::getUnqual(Type::Int8Ty), NULL);
+  B.CreateCall(F, CastToCStr(Str, B), "puts");
+}
+
+/// EmitFPutC - Emit a call to the fputc function.  This assumes that Char is
+/// an integer and File is a pointer to FILE.
+void LibCallOptimization::EmitFPutC(Value *Char, Value *File, IRBuilder &B) {
+  Module *M = Caller->getParent();
+  Constant *F = M->getOrInsertFunction("fputc", Type::Int32Ty, Type::Int32Ty,
+                                       File->getType(), NULL);
+  Char = B.CreateIntCast(Char, Type::Int32Ty, "chari");
+  B.CreateCall2(F, Char, File, "fputc");
+}
+
+/// EmitFPutS - Emit a call to the puts function.  Str is required to be a
+/// pointer and File is a pointer to FILE.
+void LibCallOptimization::EmitFPutS(Value *Str, Value *File, IRBuilder &B) {
+  Module *M = Caller->getParent();
+  Constant *F = M->getOrInsertFunction("fputs", Type::Int32Ty,
+                                       PointerType::getUnqual(Type::Int8Ty),
+                                       File->getType(), NULL);
+  B.CreateCall2(F, CastToCStr(Str, B), File, "fputs");
+}
+
+/// EmitFWrite - Emit a call to the fwrite function.  This assumes that Ptr is
+/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
+void LibCallOptimization::EmitFWrite(Value *Ptr, Value *Size, Value *File,
+                                     IRBuilder &B) {
+  Module *M = Caller->getParent();
+  Constant *F = M->getOrInsertFunction("fwrite", TD->getIntPtrType(),
+                                       PointerType::getUnqual(Type::Int8Ty),
+                                       TD->getIntPtrType(), TD->getIntPtrType(),
+                                       File->getType(), NULL);
+  B.CreateCall4(F, CastToCStr(Ptr, B), Size, 
+                ConstantInt::get(TD->getIntPtrType(), 1), File);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// GetStringLengthH - If we can compute the length of the string pointed to by
+/// the specified pointer, return 'len+1'.  If we can't, return 0.
+static uint64_t GetStringLengthH(Value *V, SmallPtrSet<PHINode*, 32> &PHIs) {
+  // Look through noop bitcast instructions.
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(V))
+    return GetStringLengthH(BCI->getOperand(0), PHIs);
+  
+  // If this is a PHI node, there are two cases: either we have already seen it
+  // or we haven't.
+  if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    if (!PHIs.insert(PN))
+      return ~0ULL;  // already in the set.
+    
+    // If it was new, see if all the input strings are the same length.
+    uint64_t LenSoFar = ~0ULL;
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      uint64_t Len = GetStringLengthH(PN->getIncomingValue(i), PHIs);
+      if (Len == 0) return 0; // Unknown length -> unknown.
+      
+      if (Len == ~0ULL) continue;
+      
+      if (Len != LenSoFar && LenSoFar != ~0ULL)
+        return 0;    // Disagree -> unknown.
+      LenSoFar = Len;
+    }
+    
+    // Success, all agree.
+    return LenSoFar;
+  }
+  
+  // strlen(select(c,x,y)) -> strlen(x) ^ strlen(y)
+  if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+    uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs);
+    if (Len1 == 0) return 0;
+    uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs);
+    if (Len2 == 0) return 0;
+    if (Len1 == ~0ULL) return Len2;
+    if (Len2 == ~0ULL) return Len1;
+    if (Len1 != Len2) return 0;
+    return Len1;
+  }
+  
+  // If the value is not a GEP instruction nor a constant expression with a
+  // GEP instruction, then return unknown.
+  User *GEP = 0;
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(V)) {
+    GEP = GEPI;
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    if (CE->getOpcode() != Instruction::GetElementPtr)
+      return 0;
+    GEP = CE;
+  } else {
+    return 0;
+  }
+  
+  // Make sure the GEP has exactly three arguments.
+  if (GEP->getNumOperands() != 3)
+    return 0;
+  
+  // Check to make sure that the first operand of the GEP is an integer and
+  // has value 0 so that we are sure we're indexing into the initializer.
+  if (ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(1))) {
+    if (!Idx->isZero())
+      return 0;
+  } else
+    return 0;
+  
+  // If the second index isn't a ConstantInt, then this is a variable index
+  // into the array.  If this occurs, we can't say anything meaningful about
+  // the string.
+  uint64_t StartIdx = 0;
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(2)))
+    StartIdx = CI->getZExtValue();
+  else
+    return 0;
+  
+  // The GEP instruction, constant or instruction, must reference a global
+  // variable that is a constant and is initialized. The referenced constant
+  // initializer is the array that we'll use for optimization.
+  GlobalVariable* GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
+  if (!GV || !GV->isConstant() || !GV->hasInitializer())
+    return 0;
+  Constant *GlobalInit = GV->getInitializer();
+  
+  // Handle the ConstantAggregateZero case, which is a degenerate case. The
+  // initializer is constant zero so the length of the string must be zero.
+  if (isa<ConstantAggregateZero>(GlobalInit))
+    return 1;  // Len = 0 offset by 1.
+  
+  // Must be a Constant Array
+  ConstantArray *Array = dyn_cast<ConstantArray>(GlobalInit);
+  if (!Array || Array->getType()->getElementType() != Type::Int8Ty)
+    return false;
+  
+  // Get the number of elements in the array
+  uint64_t NumElts = Array->getType()->getNumElements();
+  
+  // Traverse the constant array from StartIdx (derived above) which is
+  // the place the GEP refers to in the array.
+  for (unsigned i = StartIdx; i != NumElts; ++i) {
+    Constant *Elt = Array->getOperand(i);
+    ConstantInt *CI = dyn_cast<ConstantInt>(Elt);
+    if (!CI) // This array isn't suitable, non-int initializer.
+      return 0;
+    if (CI->isZero())
+      return i-StartIdx+1; // We found end of string, success!
+  }
+  
+  return 0; // The array isn't null terminated, conservatively return 'unknown'.
+}
+
+/// GetStringLength - If we can compute the length of the string pointed to by
+/// the specified pointer, return 'len+1'.  If we can't, return 0.
+static uint64_t GetStringLength(Value *V) {
+  if (!isa<PointerType>(V->getType())) return 0;
+  
+  SmallPtrSet<PHINode*, 32> PHIs;
+  uint64_t Len = GetStringLengthH(V, PHIs);
+  // If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return
+  // an empty string as a length.
+  return Len == ~0ULL ? 1 : Len;
+}
+
+/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero. 
+static bool IsOnlyUsedInZeroEqualityComparison(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+      if (IC->isEquality())
+        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+          if (C->isNullValue())
+            continue;
+    // Unknown instruction.
+    return false;
+  }
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous LibCall Optimizations
+//===----------------------------------------------------------------------===//
+
+namespace {
+//===---------------------------------------===//
+// 'exit' Optimizations
+
+/// ExitOpt - int main() { exit(4); } --> int main() { return 4; }
+struct VISIBILITY_HIDDEN ExitOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Verify we have a reasonable prototype for exit.
+    if (Callee->arg_size() == 0 || !CI->use_empty())
+      return 0;
+
+    // Verify the caller is main, and that the result type of main matches the
+    // argument type of exit.
+    if (!Caller->isName("main") || !Caller->hasExternalLinkage() ||
+        Caller->getReturnType() != CI->getOperand(1)->getType())
+      return 0;
+
+    TerminatorInst *OldTI = CI->getParent()->getTerminator();
+    
+    // Create the return after the call.
+    ReturnInst *RI = B.CreateRet(CI->getOperand(1));
+
+    // Drop all successor phi node entries.
+    for (unsigned i = 0, e = OldTI->getNumSuccessors(); i != e; ++i)
+      OldTI->getSuccessor(i)->removePredecessor(CI->getParent());
+    
+    // Erase all instructions from after our return instruction until the end of
+    // the block.
+    BasicBlock::iterator FirstDead = RI; ++FirstDead;
+    CI->getParent()->getInstList().erase(FirstDead, CI->getParent()->end());
+    return CI;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// String and Memory LibCall Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'strcat' Optimizations
+
+struct VISIBILITY_HIDDEN StrCatOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Verify the "strcat" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != PointerType::getUnqual(Type::Int8Ty) ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        FT->getParamType(1) != FT->getReturnType())
+      return 0;
+    
+    // Extract some information from the instruction
+    Value *Dst = CI->getOperand(1);
+    Value *Src = CI->getOperand(2);
+    
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+    --Len;  // Unbias length.
+    
+    // Handle the simple, do-nothing case: strcat(x, "") -> x
+    if (Len == 0)
+      return Dst;
+    
+    // We need to find the end of the destination string.  That's where the
+    // memory is to be moved to. We just generate a call to strlen.
+    Value *DstLen = EmitStrLen(Dst, B);
+    
+    // Now that we have the destination's length, we must index into the
+    // destination's pointer to get the actual memcpy destination (end of
+    // the string .. we're concatenating).
+    Dst = B.CreateGEP(Dst, DstLen, "endptr");
+    
+    // We have enough information to now generate the memcpy call to do the
+    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
+    EmitMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(), Len+1), 1, B);
+    return Dst;
+  }
+};
+
+//===---------------------------------------===//
+// 'strchr' Optimizations
+
+struct VISIBILITY_HIDDEN StrChrOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Verify the "strchr" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != PointerType::getUnqual(Type::Int8Ty) ||
+        FT->getParamType(0) != FT->getReturnType())
+      return 0;
+    
+    Value *SrcStr = CI->getOperand(1);
+    
+    // If the second operand is non-constant, see if we can compute the length
+    // of the input string and turn this into memchr.
+    ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getOperand(2));
+    if (CharC == 0) {
+      uint64_t Len = GetStringLength(SrcStr);
+      if (Len == 0 || FT->getParamType(1) != Type::Int32Ty) // memchr needs i32.
+        return 0;
+      
+      return EmitMemChr(SrcStr, CI->getOperand(2), // include nul.
+                        ConstantInt::get(TD->getIntPtrType(), Len), B);
+    }
+
+    // Otherwise, the character is a constant, see if the first argument is
+    // a string literal.  If so, we can constant fold.
+    std::string Str;
+    if (!GetConstantStringInfo(SrcStr, Str))
+      return 0;
+    
+    // strchr can find the nul character.
+    Str += '\0';
+    char CharValue = CharC->getSExtValue();
+    
+    // Compute the offset.
+    uint64_t i = 0;
+    while (1) {
+      if (i == Str.size())    // Didn't find the char.  strchr returns null.
+        return Constant::getNullValue(CI->getType());
+      // Did we find our match?
+      if (Str[i] == CharValue)
+        break;
+      ++i;
+    }
+    
+    // strchr(s+n,c)  -> gep(s+n+i,c)
+    Value *Idx = ConstantInt::get(Type::Int64Ty, i);
+    return B.CreateGEP(SrcStr, Idx, "strchr");
+  }
+};
+
+//===---------------------------------------===//
+// 'strcmp' Optimizations
+
+struct VISIBILITY_HIDDEN StrCmpOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Verify the "strcmp" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || FT->getReturnType() != Type::Int32Ty ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty))
+      return 0;
+    
+    Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2);
+    if (Str1P == Str2P)      // strcmp(x,x)  -> 0
+      return ConstantInt::get(CI->getType(), 0);
+    
+    std::string Str1, Str2;
+    bool HasStr1 = GetConstantStringInfo(Str1P, Str1);
+    bool HasStr2 = GetConstantStringInfo(Str2P, Str2);
+    
+    if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x
+      return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType());
+    
+    if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+    
+    // strcmp(x, y)  -> cnst  (if both x and y are constant strings)
+    if (HasStr1 && HasStr2)
+      return ConstantInt::get(CI->getType(), strcmp(Str1.c_str(),Str2.c_str()));
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'strncmp' Optimizations
+
+struct VISIBILITY_HIDDEN StrNCmpOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Verify the "strncmp" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != Type::Int32Ty ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty) ||
+        !isa<IntegerType>(FT->getParamType(2)))
+      return 0;
+    
+    Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2);
+    if (Str1P == Str2P)      // strncmp(x,x,n)  -> 0
+      return ConstantInt::get(CI->getType(), 0);
+    
+    // Get the length argument if it is constant.
+    uint64_t Length;
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3)))
+      Length = LengthArg->getZExtValue();
+    else
+      return 0;
+    
+    if (Length == 0) // strncmp(x,y,0)   -> 0
+      return ConstantInt::get(CI->getType(), 0);
+    
+    std::string Str1, Str2;
+    bool HasStr1 = GetConstantStringInfo(Str1P, Str1);
+    bool HasStr2 = GetConstantStringInfo(Str2P, Str2);
+    
+    if (HasStr1 && Str1.empty())  // strncmp("", x, n) -> *x
+      return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType());
+    
+    if (HasStr2 && Str2.empty())  // strncmp(x, "", n) -> *x
+      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+    
+    // strncmp(x, y)  -> cnst  (if both x and y are constant strings)
+    if (HasStr1 && HasStr2)
+      return ConstantInt::get(CI->getType(),
+                              strncmp(Str1.c_str(), Str2.c_str(), Length));
+    return 0;
+  }
+};
+
+
+//===---------------------------------------===//
+// 'strcpy' Optimizations
+
+struct VISIBILITY_HIDDEN StrCpyOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Verify the "strcpy" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty))
+      return 0;
+    
+    Value *Dst = CI->getOperand(1), *Src = CI->getOperand(2);
+    if (Dst == Src)      // strcpy(x,x)  -> x
+      return Src;
+    
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+    
+    // We have enough information to now generate the memcpy call to do the
+    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
+    EmitMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(), Len), 1, B);
+    return Dst;
+  }
+};
+
+
+
+//===---------------------------------------===//
+// 'strlen' Optimizations
+
+struct VISIBILITY_HIDDEN StrLenOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty) ||
+        !isa<IntegerType>(FT->getReturnType()))
+      return 0;
+    
+    Value *Src = CI->getOperand(1);
+
+    // Constant folding: strlen("xyz") -> 3
+    if (uint64_t Len = GetStringLength(Src))
+      return ConstantInt::get(CI->getType(), Len-1);
+
+    // Handle strlen(p) != 0.
+    if (!IsOnlyUsedInZeroEqualityComparison(CI)) return 0;
+
+    // strlen(x) != 0 --> *x != 0
+    // strlen(x) == 0 --> *x == 0
+    return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
+  }
+};
+
+//===---------------------------------------===//
+// 'memcmp' Optimizations
+
+struct VISIBILITY_HIDDEN MemCmpOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        FT->getReturnType() != Type::Int32Ty)
+      return 0;
+
+    Value *LHS = CI->getOperand(1), *RHS = CI->getOperand(2);
+
+    if (LHS == RHS)  // memcmp(s,s,x) -> 0
+      return Constant::getNullValue(CI->getType());
+
+    // Make sure we have a constant length.
+    ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getOperand(3));
+    if (!LenC) return 0;
+    uint64_t Len = LenC->getZExtValue();
+
+    if (Len == 0) // memcmp(s1,s2,0) -> 0
+      return Constant::getNullValue(CI->getType());
+
+    if (Len == 1) { // memcmp(S1,S2,1) -> *LHS - *RHS
+      Value *LHSV = B.CreateLoad(CastToCStr(LHS, B), "lhsv");
+      Value *RHSV = B.CreateLoad(CastToCStr(RHS, B), "rhsv");
+      return B.CreateZExt(B.CreateSub(LHSV, RHSV, "chardiff"), CI->getType());
+    }
+
+    // memcmp(S1,S2,2) != 0 -> (*(short*)LHS ^ *(short*)RHS)  != 0
+    // memcmp(S1,S2,4) != 0 -> (*(int*)LHS ^ *(int*)RHS)  != 0
+    if ((Len == 2 || Len == 4) && IsOnlyUsedInZeroEqualityComparison(CI)) {
+      const Type *PTy = PointerType::getUnqual(Len == 2 ?
+                                               Type::Int16Ty : Type::Int32Ty);
+      LHS = B.CreateBitCast(LHS, PTy, "tmp");
+      RHS = B.CreateBitCast(RHS, PTy, "tmp");
+      LoadInst *LHSV = B.CreateLoad(LHS, "lhsv");
+      LoadInst *RHSV = B.CreateLoad(RHS, "rhsv");
+      LHSV->setAlignment(1); RHSV->setAlignment(1);  // Unaligned loads.
+      return B.CreateZExt(B.CreateXor(LHSV, RHSV, "shortdiff"), CI->getType());
+    }
+
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'memcpy' Optimizations
+
+struct VISIBILITY_HIDDEN MemCpyOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        FT->getParamType(2) != TD->getIntPtrType())
+      return 0;
+
+    // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+    EmitMemCpy(CI->getOperand(1), CI->getOperand(2), CI->getOperand(3), 1, B);
+    return CI->getOperand(1);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'pow*' Optimizations
+
+struct VISIBILITY_HIDDEN PowOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 2 arguments of the same FP type, which match the
+    // result type.
+    if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        !FT->getParamType(0)->isFloatingPoint())
+      return 0;
+    
+    Value *Op1 = CI->getOperand(1), *Op2 = CI->getOperand(2);
+    if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+      if (Op1C->isExactlyValue(1.0))  // pow(1.0, x) -> 1.0
+        return Op1C;
+      if (Op1C->isExactlyValue(2.0))  // pow(2.0, x) -> exp2(x)
+        return EmitUnaryFloatFnCall(Op2, "exp2", B);
+    }
+    
+    ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
+    if (Op2C == 0) return 0;
+    
+    if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
+      return ConstantFP::get(CI->getType(), 1.0);
+    
+    if (Op2C->isExactlyValue(0.5)) {
+      // FIXME: This is not safe for -0.0 and -inf.  This can only be done when
+      // 'unsafe' math optimizations are allowed.
+      // x    pow(x, 0.5)  sqrt(x)
+      // ---------------------------------------------
+      // -0.0    +0.0       -0.0
+      // -inf    +inf       NaN
+#if 0
+      // pow(x, 0.5) -> sqrt(x)
+      return B.CreateCall(get_sqrt(), Op1, "sqrt");
+#endif
+    }
+    
+    if (Op2C->isExactlyValue(1.0))  // pow(x, 1.0) -> x
+      return Op1;
+    if (Op2C->isExactlyValue(2.0))  // pow(x, 2.0) -> x*x
+      return B.CreateMul(Op1, Op1, "pow2");
+    if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
+      return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'exp2' Optimizations
+
+struct VISIBILITY_HIDDEN Exp2Opt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 1 argument of FP type, which matches the
+    // result type.
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isFloatingPoint())
+      return 0;
+    
+    Value *Op = CI->getOperand(1);
+    // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x))  if sizeof(x) <= 32
+    // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x))  if sizeof(x) < 32
+    Value *LdExpArg = 0;
+    if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
+        LdExpArg = B.CreateSExt(OpC->getOperand(0), Type::Int32Ty, "tmp");
+    } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
+        LdExpArg = B.CreateZExt(OpC->getOperand(0), Type::Int32Ty, "tmp");
+    }
+    
+    if (LdExpArg) {
+      const char *Name;
+      if (Op->getType() == Type::FloatTy)
+        Name = "ldexpf";
+      else if (Op->getType() == Type::DoubleTy)
+        Name = "ldexp";
+      else
+        Name = "ldexpl";
+
+      Constant *One = ConstantFP::get(APFloat(1.0f));
+      if (Op->getType() != Type::FloatTy)
+        One = ConstantExpr::getFPExtend(One, Op->getType());
+
+      Module *M = Caller->getParent();
+      Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+                                             Op->getType(), Type::Int32Ty,NULL);
+      return B.CreateCall2(Callee, One, LdExpArg);
+    }
+    return 0;
+  }
+};
+    
+
+//===---------------------------------------===//
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+
+struct VISIBILITY_HIDDEN UnaryDoubleFPOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 || FT->getReturnType() != Type::DoubleTy ||
+        FT->getParamType(0) != Type::DoubleTy)
+      return 0;
+    
+    // If this is something like 'floor((double)floatval)', convert to floorf.
+    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getOperand(1));
+    if (Cast == 0 || Cast->getOperand(0)->getType() != Type::FloatTy)
+      return 0;
+
+    // floor((double)floatval) -> (double)floorf(floatval)
+    Value *V = Cast->getOperand(0);
+    V = EmitUnaryFloatFnCall(V, Callee->getNameStart(), B);
+    return B.CreateFPExt(V, Type::DoubleTy);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Integer Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'ffs*' Optimizations
+
+struct VISIBILITY_HIDDEN FFSOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 2 arguments of the same FP type, which match the
+    // result type.
+    if (FT->getNumParams() != 1 || FT->getReturnType() != Type::Int32Ty ||
+        !isa<IntegerType>(FT->getParamType(0)))
+      return 0;
+    
+    Value *Op = CI->getOperand(1);
+    
+    // Constant fold.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+      if (CI->getValue() == 0)  // ffs(0) -> 0.
+        return Constant::getNullValue(CI->getType());
+      return ConstantInt::get(Type::Int32Ty, // ffs(c) -> cttz(c)+1
+                              CI->getValue().countTrailingZeros()+1);
+    }
+    
+    // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+    const Type *ArgType = Op->getType();
+    Value *F = Intrinsic::getDeclaration(Callee->getParent(),
+                                         Intrinsic::cttz, &ArgType, 1);
+    Value *V = B.CreateCall(F, Op, "cttz");
+    V = B.CreateAdd(V, ConstantInt::get(Type::Int32Ty, 1), "tmp");
+    V = B.CreateIntCast(V, Type::Int32Ty, false, "tmp");
+    
+    Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp");
+    return B.CreateSelect(Cond, V, ConstantInt::get(Type::Int32Ty, 0));
+  }
+};
+
+//===---------------------------------------===//
+// 'isdigit' Optimizations
+
+struct VISIBILITY_HIDDEN IsDigitOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // We require integer(i32)
+    if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+        FT->getParamType(0) != Type::Int32Ty)
+      return 0;
+    
+    // isdigit(c) -> (c-'0') <u 10
+    Value *Op = CI->getOperand(1);
+    Op = B.CreateSub(Op, ConstantInt::get(Type::Int32Ty, '0'), "isdigittmp");
+    Op = B.CreateICmpULT(Op, ConstantInt::get(Type::Int32Ty, 10), "isdigit");
+    return B.CreateZExt(Op, CI->getType());
+  }
+};
+
+//===---------------------------------------===//
+// 'isascii' Optimizations
+
+struct VISIBILITY_HIDDEN IsAsciiOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // We require integer(i32)
+    if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+        FT->getParamType(0) != Type::Int32Ty)
+      return 0;
+    
+    // isascii(c) -> c <u 128
+    Value *Op = CI->getOperand(1);
+    Op = B.CreateICmpULT(Op, ConstantInt::get(Type::Int32Ty, 128), "isascii");
+    return B.CreateZExt(Op, CI->getType());
+  }
+};
+  
+//===---------------------------------------===//
+// 'abs', 'labs', 'llabs' Optimizations
+
+struct VISIBILITY_HIDDEN AbsOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // We require integer(integer) where the types agree.
+    if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+        FT->getParamType(0) != FT->getReturnType())
+      return 0;
+    
+    // abs(x) -> x >s -1 ? x : -x
+    Value *Op = CI->getOperand(1);
+    Value *Pos = B.CreateICmpSGT(Op,ConstantInt::getAllOnesValue(Op->getType()),
+                                 "ispos");
+    Value *Neg = B.CreateNeg(Op, "neg");
+    return B.CreateSelect(Pos, Op, Neg);
+  }
+};
+  
+
+//===---------------------------------------===//
+// 'toascii' Optimizations
+
+struct VISIBILITY_HIDDEN ToAsciiOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // We require i32(i32)
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != Type::Int32Ty)
+      return 0;
+    
+    // isascii(c) -> c & 0x7f
+    return B.CreateAnd(CI->getOperand(1), ConstantInt::get(CI->getType(),0x7F));
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'printf' Optimizations
+
+struct VISIBILITY_HIDDEN PrintFOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Require one fixed pointer argument and an integer/void result.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() < 1 || !isa<PointerType>(FT->getParamType(0)) ||
+        !(isa<IntegerType>(FT->getReturnType()) ||
+          FT->getReturnType() == Type::VoidTy))
+      return 0;
+    
+    // Check for a fixed format string.
+    std::string FormatStr;
+    if (!GetConstantStringInfo(CI->getOperand(1), FormatStr))
+      return 0;
+
+    // Empty format string -> noop.
+    if (FormatStr.empty())  // Tolerate printf's declared void.
+      return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 0);
+    
+    // printf("x") -> putchar('x'), even for '%'.
+    if (FormatStr.size() == 1) {
+      EmitPutChar(ConstantInt::get(Type::Int32Ty, FormatStr[0]), B);
+      return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 1);
+    }
+    
+    // printf("foo\n") --> puts("foo")
+    if (FormatStr[FormatStr.size()-1] == '\n' &&
+        FormatStr.find('%') == std::string::npos) {  // no format characters.
+      // Create a string literal with no \n on it.  We expect the constant merge
+      // pass to be run after this pass, to merge duplicate strings.
+      FormatStr.erase(FormatStr.end()-1);
+      Constant *C = ConstantArray::get(FormatStr, true);
+      C = new GlobalVariable(C->getType(), true,GlobalVariable::InternalLinkage,
+                             C, "str", Callee->getParent());
+      EmitPutS(C, B);
+      return CI->use_empty() ? (Value*)CI : 
+                          ConstantInt::get(CI->getType(), FormatStr.size()+1);
+    }
+    
+    // Optimize specific format strings.
+    // printf("%c", chr) --> putchar(*(i8*)dst)
+    if (FormatStr == "%c" && CI->getNumOperands() > 2 &&
+        isa<IntegerType>(CI->getOperand(2)->getType())) {
+      EmitPutChar(CI->getOperand(2), B);
+      return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 1);
+    }
+    
+    // printf("%s\n", str) --> puts(str)
+    if (FormatStr == "%s\n" && CI->getNumOperands() > 2 &&
+        isa<PointerType>(CI->getOperand(2)->getType()) &&
+        CI->use_empty()) {
+      EmitPutS(CI->getOperand(2), B);
+      return CI;
+    }
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'sprintf' Optimizations
+
+struct VISIBILITY_HIDDEN SPrintFOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Require two fixed pointer arguments and an integer result.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        !isa<IntegerType>(FT->getReturnType()))
+      return 0;
+
+    // Check for a fixed format string.
+    std::string FormatStr;
+    if (!GetConstantStringInfo(CI->getOperand(2), FormatStr))
+      return 0;
+    
+    // If we just have a format string (nothing else crazy) transform it.
+    if (CI->getNumOperands() == 3) {
+      // Make sure there's no % in the constant array.  We could try to handle
+      // %% -> % in the future if we cared.
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')
+          return 0; // we found a format specifier, bail out.
+      
+      // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+      EmitMemCpy(CI->getOperand(1), CI->getOperand(2), // Copy the nul byte.
+                 ConstantInt::get(TD->getIntPtrType(), FormatStr.size()+1),1,B);
+      return ConstantInt::get(CI->getType(), FormatStr.size());
+    }
+    
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4)
+      return 0;
+    
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+      if (!isa<IntegerType>(CI->getOperand(3)->getType())) return 0;
+      Value *V = B.CreateTrunc(CI->getOperand(3), Type::Int8Ty, "char");
+      Value *Ptr = CastToCStr(CI->getOperand(1), B);
+      B.CreateStore(V, Ptr);
+      Ptr = B.CreateGEP(Ptr, ConstantInt::get(Type::Int32Ty, 1), "nul");
+      B.CreateStore(Constant::getNullValue(Type::Int8Ty), Ptr);
+      
+      return ConstantInt::get(CI->getType(), 1);
+    }
+    
+    if (FormatStr[1] == 's') {
+      // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+      if (!isa<PointerType>(CI->getOperand(3)->getType())) return 0;
+
+      Value *Len = EmitStrLen(CI->getOperand(3), B);
+      Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1),
+                                  "leninc");
+      EmitMemCpy(CI->getOperand(1), CI->getOperand(3), IncLen, 1, B);
+      
+      // The sprintf result is the unincremented number of bytes in the string.
+      return B.CreateIntCast(Len, CI->getType(), false);
+    }
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'fwrite' Optimizations
+
+struct VISIBILITY_HIDDEN FWriteOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Require a pointer, an integer, an integer, a pointer, returning integer.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 4 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<IntegerType>(FT->getParamType(1)) ||
+        !isa<IntegerType>(FT->getParamType(2)) ||
+        !isa<PointerType>(FT->getParamType(3)) ||
+        !isa<IntegerType>(FT->getReturnType()))
+      return 0;
+    
+    // Get the element size and count.
+    ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getOperand(2));
+    ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getOperand(3));
+    if (!SizeC || !CountC) return 0;
+    uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
+    
+    // If this is writing zero records, remove the call (it's a noop).
+    if (Bytes == 0)
+      return ConstantInt::get(CI->getType(), 0);
+    
+    // If this is writing one byte, turn it into fputc.
+    if (Bytes == 1) {  // fwrite(S,1,1,F) -> fputc(S[0],F)
+      Value *Char = B.CreateLoad(CastToCStr(CI->getOperand(1), B), "char");
+      EmitFPutC(Char, CI->getOperand(4), B);
+      return ConstantInt::get(CI->getType(), 1);
+    }
+
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'fputs' Optimizations
+
+struct VISIBILITY_HIDDEN FPutsOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Require two pointers.  Also, we can't optimize if return value is used.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        !CI->use_empty())
+      return 0;
+    
+    // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+    uint64_t Len = GetStringLength(CI->getOperand(1));
+    if (!Len) return 0;
+    EmitFWrite(CI->getOperand(1), ConstantInt::get(TD->getIntPtrType(), Len-1),
+               CI->getOperand(2), B);
+    return CI;  // Known to have no uses (see above).
+  }
+};
+
+//===---------------------------------------===//
+// 'fprintf' Optimizations
+
+struct VISIBILITY_HIDDEN FPrintFOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder &B) {
+    // Require two fixed paramters as pointers and integer result.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        !isa<IntegerType>(FT->getReturnType()))
+      return 0;
+    
+    // All the optimizations depend on the format string.
+    std::string FormatStr;
+    if (!GetConstantStringInfo(CI->getOperand(2), FormatStr))
+      return 0;
+
+    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+    if (CI->getNumOperands() == 3) {
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
+          return 0; // We found a format specifier.
+      
+      EmitFWrite(CI->getOperand(2), ConstantInt::get(TD->getIntPtrType(),
+                                                     FormatStr.size()),
+                 CI->getOperand(1), B);
+      return ConstantInt::get(CI->getType(), FormatStr.size());
+    }
+    
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4)
+      return 0;
+    
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // fprintf(F, "%c", chr) --> *(i8*)dst = chr
+      if (!isa<IntegerType>(CI->getOperand(3)->getType())) return 0;
+      EmitFPutC(CI->getOperand(3), CI->getOperand(1), B);
+      return ConstantInt::get(CI->getType(), 1);
+    }
+    
+    if (FormatStr[1] == 's') {
+      // fprintf(F, "%s", str) -> fputs(str, F)
+      if (!isa<PointerType>(CI->getOperand(3)->getType()) || !CI->use_empty())
+        return 0;
+      EmitFPutS(CI->getOperand(3), CI->getOperand(1), B);
+      return CI;
+    }
+    return 0;
+  }
+};
+
+} // end anonymous namespace.
+
+//===----------------------------------------------------------------------===//
+// SimplifyLibCalls Pass Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// This pass optimizes well known library functions from libc and libm.
+  ///
+  class VISIBILITY_HIDDEN SimplifyLibCalls : public FunctionPass {
+    StringMap<LibCallOptimization*> Optimizations;
+    // Miscellaneous LibCall Optimizations
+    ExitOpt Exit; 
+    // String and Memory LibCall Optimizations
+    StrCatOpt StrCat; StrChrOpt StrChr; StrCmpOpt StrCmp; StrNCmpOpt StrNCmp;
+    StrCpyOpt StrCpy; StrLenOpt StrLen; MemCmpOpt MemCmp; MemCpyOpt  MemCpy;
+    // Math Library Optimizations
+    PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP;
+    // Integer Optimizations
+    FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
+    ToAsciiOpt ToAscii;
+    // Formatting and IO Optimizations
+    SPrintFOpt SPrintF; PrintFOpt PrintF;
+    FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF;
+  public:
+    static char ID; // Pass identification
+    SimplifyLibCalls() : FunctionPass((intptr_t)&ID) {}
+
+    void InitOptimizations();
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+    }
+  };
+  char SimplifyLibCalls::ID = 0;
+} // end anonymous namespace.
+
+static RegisterPass<SimplifyLibCalls>
+X("simplify-libcalls", "Simplify well-known library calls");
+
+// Public interface to the Simplify LibCalls pass.
+FunctionPass *llvm::createSimplifyLibCallsPass() {
+  return new SimplifyLibCalls(); 
+}
+
+/// Optimizations - Populate the Optimizations map with all the optimizations
+/// we know.
+void SimplifyLibCalls::InitOptimizations() {
+  // Miscellaneous LibCall Optimizations
+  Optimizations["exit"] = &Exit;
+  
+  // String and Memory LibCall Optimizations
+  Optimizations["strcat"] = &StrCat;
+  Optimizations["strchr"] = &StrChr;
+  Optimizations["strcmp"] = &StrCmp;
+  Optimizations["strncmp"] = &StrNCmp;
+  Optimizations["strcpy"] = &StrCpy;
+  Optimizations["strlen"] = &StrLen;
+  Optimizations["memcmp"] = &MemCmp;
+  Optimizations["memcpy"] = &MemCpy;
+  
+  // Math Library Optimizations
+  Optimizations["powf"] = &Pow;
+  Optimizations["pow"] = &Pow;
+  Optimizations["powl"] = &Pow;
+  Optimizations["exp2l"] = &Exp2;
+  Optimizations["exp2"] = &Exp2;
+  Optimizations["exp2f"] = &Exp2;
+  
+#ifdef HAVE_FLOORF
+  Optimizations["floor"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_CEILF
+  Optimizations["ceil"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_ROUNDF
+  Optimizations["round"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_RINTF
+  Optimizations["rint"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_NEARBYINTF
+  Optimizations["nearbyint"] = &UnaryDoubleFP;
+#endif
+  
+  // Integer Optimizations
+  Optimizations["ffs"] = &FFS;
+  Optimizations["ffsl"] = &FFS;
+  Optimizations["ffsll"] = &FFS;
+  Optimizations["abs"] = &Abs;
+  Optimizations["labs"] = &Abs;
+  Optimizations["llabs"] = &Abs;
+  Optimizations["isdigit"] = &IsDigit;
+  Optimizations["isascii"] = &IsAscii;
+  Optimizations["toascii"] = &ToAscii;
+  
+  // Formatting and IO Optimizations
+  Optimizations["sprintf"] = &SPrintF;
+  Optimizations["printf"] = &PrintF;
+  Optimizations["fwrite"] = &FWrite;
+  Optimizations["fputs"] = &FPuts;
+  Optimizations["fprintf"] = &FPrintF;
+}
+
+
+/// runOnFunction - Top level algorithm.
+///
+bool SimplifyLibCalls::runOnFunction(Function &F) {
+  if (Optimizations.empty())
+    InitOptimizations();
+  
+  const TargetData &TD = getAnalysis<TargetData>();
+  
+  IRBuilder Builder;
+
+  bool Changed = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+      // Ignore non-calls.
+      CallInst *CI = dyn_cast<CallInst>(I++);
+      if (!CI) continue;
+      
+      // Ignore indirect calls and calls to non-external functions.
+      Function *Callee = CI->getCalledFunction();
+      if (Callee == 0 || !Callee->isDeclaration() ||
+          !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage()))
+        continue;
+      
+      // Ignore unknown calls.
+      const char *CalleeName = Callee->getNameStart();
+      StringMap<LibCallOptimization*>::iterator OMI =
+        Optimizations.find(CalleeName, CalleeName+Callee->getNameLen());
+      if (OMI == Optimizations.end()) continue;
+      
+      // Set the builder to the instruction after the call.
+      Builder.SetInsertPoint(BB, I);
+      
+      // Try to optimize this call.
+      Value *Result = OMI->second->OptimizeCall(CI, TD, Builder);
+      if (Result == 0) continue;
+
+      DEBUG(DOUT << "SimplifyLibCalls simplified: " << *CI;
+            DOUT << "  into: " << *Result << "\n");
+      
+      // Something changed!
+      Changed = true;
+      ++NumSimplified;
+      
+      // Inspect the instruction after the call (which was potentially just
+      // added) next.
+      I = CI; ++I;
+      
+      if (CI != Result && !CI->use_empty()) {
+        CI->replaceAllUsesWith(Result);
+        if (!Result->hasName())
+          Result->takeName(CI);
+      }
+      CI->eraseFromParent();
+    }
+  }
+  return Changed;
+}
+
+
+// TODO:
+//   Additional cases that we need to add to this file:
+//
+// cbrt:
+//   * cbrt(expN(X))  -> expN(x/3)
+//   * cbrt(sqrt(x))  -> pow(x,1/6)
+//   * cbrt(sqrt(x))  -> pow(x,1/9)
+//
+// cos, cosf, cosl:
+//   * cos(-x)  -> cos(x)
+//
+// exp, expf, expl:
+//   * exp(log(x))  -> x
+//
+// log, logf, logl:
+//   * log(exp(x))   -> x
+//   * log(x**y)     -> y*log(x)
+//   * log(exp(y))   -> y*log(e)
+//   * log(exp2(y))  -> y*log(2)
+//   * log(exp10(y)) -> y*log(10)
+//   * log(sqrt(x))  -> 0.5*log(x)
+//   * log(pow(x,y)) -> y*log(x)
+//
+// lround, lroundf, lroundl:
+//   * lround(cnst) -> cnst'
+//
+// memcmp:
+//   * memcmp(x,y,l)   -> cnst
+//      (if all arguments are constant and strlen(x) <= l and strlen(y) <= l)
+//
+// memmove:
+//   * memmove(d,s,l,a) -> memcpy(d,s,l,a)
+//       (if s is a global constant array)
+//
+// pow, powf, powl:
+//   * pow(exp(x),y)  -> exp(x*y)
+//   * pow(sqrt(x),y) -> pow(x,y*0.5)
+//   * pow(pow(x,y),z)-> pow(x,y*z)
+//
+// puts:
+//   * puts("") -> putchar("\n")
+//
+// round, roundf, roundl:
+//   * round(cnst) -> cnst'
+//
+// signbit:
+//   * signbit(cnst) -> cnst'
+//   * signbit(nncst) -> 0 (if pstv is a non-negative constant)
+//
+// sqrt, sqrtf, sqrtl:
+//   * sqrt(expN(x))  -> expN(x*0.5)
+//   * sqrt(Nroot(x)) -> pow(x,1/(2*N))
+//   * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
+//
+// stpcpy:
+//   * stpcpy(str, "literal") ->
+//           llvm.memcpy(str,"literal",strlen("literal")+1,1)
+// strrchr:
+//   * strrchr(s,c) -> reverse_offset_of_in(c,s)
+//      (if c is a constant integer and s is a constant string)
+//   * strrchr(s1,0) -> strchr(s1,0)
+//
+// strncat:
+//   * strncat(x,y,0) -> x
+//   * strncat(x,y,0) -> x (if strlen(y) = 0)
+//   * strncat(x,y,l) -> strcat(x,y) (if y and l are constants an l > strlen(y))
+//
+// strncpy:
+//   * strncpy(d,s,0) -> d
+//   * strncpy(d,s,l) -> memcpy(d,s,l,1)
+//      (if s and l are constants)
+//
+// strpbrk:
+//   * strpbrk(s,a) -> offset_in_for(s,a)
+//      (if s and a are both constant strings)
+//   * strpbrk(s,"") -> 0
+//   * strpbrk(s,a) -> strchr(s,a[0]) (if a is constant string of length 1)
+//
+// strspn, strcspn:
+//   * strspn(s,a)   -> const_int (if both args are constant)
+//   * strspn("",a)  -> 0
+//   * strspn(s,"")  -> 0
+//   * strcspn(s,a)  -> const_int (if both args are constant)
+//   * strcspn("",a) -> 0
+//   * strcspn(s,"") -> strlen(a)
+//
+// strstr:
+//   * strstr(x,x)  -> x
+//   * strstr(s1,s2) -> offset_of_s2_in(s1)
+//       (if s1 and s2 are constant strings)
+//
+// tan, tanf, tanl:
+//   * tan(atan(x)) -> x
+//
+// trunc, truncf, truncl:
+//   * trunc(cnst) -> cnst'
+//
+//