[llvm-branch-commits] [llvm-branch] r172541 [5/8] - in /llvm/branches/AMDILBackend: ./ autoconf/ bindings/ocaml/executionengine/ bindings/ocaml/llvm/ bindings/ocaml/target/ cmake/ cmake/modules/ cmake/platforms/ docs/ docs/CommandGuide/ docs/_themes/ docs/_themes/llvm-theme/ docs/_themes/llvm-theme/static/ docs/llvm-theme/ docs/llvm-theme/static/ docs/tutorial/ examples/ExceptionDemo/ examples/Fibonacci/ examples/Kaleidoscope/Chapter4/ examples/Kaleidoscope/Chapter5/ examples/Kaleidoscope/Chapter6/ examples/Kaleidoscope/Chapt...

Richard Relph Richard.Relph at amd.com
Tue Jan 15 09:16:26 PST 2013


Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCISelLowering.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCISelLowering.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCISelLowering.cpp Tue Jan 15 11:16:16 2013
@@ -106,7 +106,7 @@
   // from FP_ROUND:  that rounds to nearest, this rounds to zero.
   setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
 
-  // We do not currently implment this libm ops for PowerPC.
+  // We do not currently implement these libm ops for PowerPC.
   setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
   setOperationAction(ISD::FCEIL,  MVT::ppcf128, Expand);
   setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
@@ -361,6 +361,22 @@
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
       setOperationAction(ISD::CTTZ, VT, Expand);
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+      for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+           j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
+        MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
+        setTruncStoreAction(VT, InnerVT, Expand);
+      }
+      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+    }
+
+    for (unsigned i = (unsigned)MVT::FIRST_FP_VECTOR_VALUETYPE;
+         i <= (unsigned)MVT::LAST_FP_VECTOR_VALUETYPE; ++i) {
+      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
+      setOperationAction(ISD::FSQRT, VT, Expand);
     }
 
     // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
@@ -373,6 +389,10 @@
     setOperationAction(ISD::LOAD  , MVT::v4i32, Legal);
     setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
     setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
 
     addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
     addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
@@ -392,6 +412,14 @@
     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+    // Altivec does not contain unordered floating-point compare instructions
+    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand);
   }
 
   if (Subtarget->has64BitSupport()) {
@@ -449,6 +477,21 @@
   setSchedulingPreference(Sched::Hybrid);
 
   computeRegisterProperties();
+
+  // The Freescale cores do better with aggressive inlining of memcpy and
+  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
+  if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc ||
+      Subtarget->getDarwinDirective() == PPC::DIR_E5500) {
+    maxStoresPerMemset = 32;
+    maxStoresPerMemsetOptSize = 16;
+    maxStoresPerMemcpy = 32;
+    maxStoresPerMemcpyOptSize = 8;
+    maxStoresPerMemmove = 32;
+    maxStoresPerMemmoveOptSize = 8;
+
+    setPrefFunctionAlignment(4);
+    benefitFromCodePlacementOpt = true;
+  }
 }
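
The maxStoresPerMemcpy family caps how many discrete stores the lowering
may emit before falling back to a library call. A minimal sketch of the
cutoff this implies, with assumed sizes (not code from this patch):

    unsigned maxStoresPerMemcpy = 32;   // value set above for e500mc/e5500
    unsigned StoreSize = 4;             // assumed: 32-bit word stores
    unsigned CopyBytes = 96;            // assumed size of the copy
    unsigned StoresNeeded = (CopyBytes + StoreSize - 1) / StoreSize;  // 24
    bool ExpandInline = StoresNeeded <= maxStoresPerMemcpy;           // true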
 
 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -517,11 +560,15 @@
   case PPCISD::FADDRTZ:         return "PPCISD::FADDRTZ";
   case PPCISD::MTFSF:           return "PPCISD::MTFSF";
   case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
+  case PPCISD::CR6SET:          return "PPCISD::CR6SET";
+  case PPCISD::CR6UNSET:        return "PPCISD::CR6UNSET";
   }
 }
 
 EVT PPCTargetLowering::getSetCCResultType(EVT VT) const {
-  return MVT::i32;
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
 }
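
The effect of the getSetCCResultType change, sketched with assumed types
(v4f32 is illustrative, not taken from a particular call site):

    EVT VT = MVT::v4f32;                       // comparing two v4f32 values
    EVT CCVT = VT.isVector()
                 ? VT.changeVectorElementTypeToInteger()  // -> MVT::v4i32
                 : EVT(MVT::i32);                         // scalars keep i32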
 
 //===----------------------------------------------------------------------===//
@@ -811,14 +858,13 @@
   }
 
   // Properly sign extend the value.
-  int ShAmt = (4-ByteSize)*8;
-  int MaskVal = ((int)Value << ShAmt) >> ShAmt;
+  int MaskVal = SignExtend32(Value, ByteSize * 8);
 
   // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
   if (MaskVal == 0) return SDValue();
 
   // Finally, if this value fits in a 5 bit sext field, return it
-  if (((MaskVal << (32-5)) >> (32-5)) == MaskVal)
+  if (SignExtend32<5>(MaskVal) == MaskVal)
     return DAG.getTargetConstant(MaskVal, MVT::i32);
   return SDValue();
 }
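
SignExtend32 comes from llvm/Support/MathExtras.h and replaces the shift
pairs above, which relied on implementation-defined behavior of signed
right shifts. A small sketch of the 5-bit round-trip test, with an assumed
value:

    #include "llvm/Support/MathExtras.h"

    int MaskVal = -7;  // assumed splat value
    // Sign-extend the low 5 bits; equality means MaskVal fits the 5-bit
    // signed immediate range -16..15.  The low bits 0b11001 (25) extend
    // back to -7, so the test passes here.
    bool Fits = (llvm::SignExtend32<5>(MaskVal) == MaskVal);  // true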
@@ -1204,6 +1250,14 @@
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   const Constant *C = CP->getConstVal();
 
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+    return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
   SDValue CPIHi =
@@ -1217,6 +1271,14 @@
   EVT PtrVT = Op.getValueType();
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
 
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+    return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
   SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
@@ -1232,8 +1294,8 @@
 
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
-  SDValue TgtBAHi = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true, MOHiFlag);
-  SDValue TgtBALo = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true, MOLoFlag);
+  SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
+  SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
   return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
 }
 
@@ -1441,7 +1503,7 @@
                               MachinePointerInfo(),
                               MVT::i32, false, false, 0);
 
-  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(), 
+  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
                      false, false, false, 0);
 }
 
@@ -1461,7 +1523,7 @@
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   bool isPPC64 = (PtrVT == MVT::i64);
   Type *IntPtrTy =
-    DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType(
+    DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType(
                                                              *DAG.getContext());
 
   TargetLowering::ArgListTy Args;
@@ -1684,9 +1746,13 @@
                                         DebugLoc dl, SelectionDAG &DAG,
                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
-  if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) {
-    return LowerFormalArguments_SVR4(Chain, CallConv, isVarArg, Ins,
-                                     dl, DAG, InVals);
+  if (PPCSubTarget.isSVR4ABI()) {
+    if (PPCSubTarget.isPPC64())
+      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
+    else
+      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
   } else {
     return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
                                        dl, DAG, InVals);
@@ -1694,7 +1760,7 @@
 }
 
 SDValue
-PPCTargetLowering::LowerFormalArguments_SVR4(
+PPCTargetLowering::LowerFormalArguments_32SVR4(
                                       SDValue Chain,
                                       CallingConv::ID CallConv, bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg>
@@ -1911,6 +1977,334 @@
   return Chain;
 }
 
+// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+// value to MVT::i64 and then truncate to the correct register size.
+SDValue
+PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
+                                     SelectionDAG &DAG, SDValue ArgVal,
+                                     DebugLoc dl) const {
+  if (Flags.isSExt())
+    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  else if (Flags.isZExt())
+    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  
+  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+}
+
+// Set the size that is at least reserved in the caller of this function.  Tail
+// call optimized functions' reserved stack space needs to be aligned so that
+// taking the difference between two stack areas will result in an aligned
+// stack.
+void
+PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
+                                      unsigned nAltivecParamsAtEnd,
+                                      unsigned MinReservedArea,
+                                      bool isPPC64) const {
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  // Add the Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    MinReservedArea = ((MinReservedArea+15)/16)*16;
+    MinReservedArea += 16*nAltivecParamsAtEnd;
+  }
+  MinReservedArea =
+    std::max(MinReservedArea,
+             PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
+  unsigned TargetAlign
+    = DAG.getMachineFunction().getTarget().getFrameLowering()->
+        getStackAlignment();
+  unsigned AlignMask = TargetAlign-1;
+  MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
+  FI->setMinReservedArea(MinReservedArea);
+}
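
A worked example of the arithmetic in setMinReservedArea, with assumed
inputs (the std::max against getMinCallFrameSize is elided):

    unsigned MinReservedArea = 52;      // assumed running total
    unsigned nAltivecParamsAtEnd = 2;   // assumed
    MinReservedArea = ((MinReservedArea + 15) / 16) * 16;  // 52 -> 64
    MinReservedArea += 16 * nAltivecParamsAtEnd;           // 64 -> 96
    unsigned AlignMask = 16 - 1;        // assume 16-byte stack alignment
    MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;  // 96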
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_64SVR4(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = 8;
+
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
+  // Area that is at least reserved in caller of this function.
+  unsigned MinReservedArea = ArgOffset;
+
+  static const uint16_t GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned Num_GPR_Regs = array_lengthof(GPR);
+  const unsigned Num_FPR_Regs = 13;
+  const unsigned Num_VR_Regs  = array_lengthof(VR);
+
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+
+  SmallVector<SDValue, 8> MemOps;
+  unsigned nAltivecParamsAtEnd = 0;
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
+    SDValue ArgVal;
+    bool needsLoad = false;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+    unsigned ArgSize = ObjSize;
+    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+    unsigned CurArgOffset = ArgOffset;
+
+    // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
+    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+      if (isVarArg) {
+        MinReservedArea = ((MinReservedArea+15)/16)*16;
+        MinReservedArea += CalculateStackSlotSize(ObjectVT,
+                                                  Flags,
+                                                  PtrByteSize);
+      } else
+        nAltivecParamsAtEnd++;
+    } else
+      // Calculate min reserved area.
+      MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
+                                                Flags,
+                                                PtrByteSize);
+
+    // FIXME the codegen can be much improved in some cases.
+    // We do not have to keep everything in memory.
+    if (Flags.isByVal()) {
+      // ObjSize is the true size; ArgSize is ObjSize rounded up to a
+      // multiple of the register size.
+      ObjSize = Flags.getByValSize();
+      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      // Empty aggregate parameters do not take up registers.  Examples:
+      //   struct { } a;
+      //   union  { } b;
+      //   int c[0];
+      // etc.  However, we have to provide a place-holder in InVals, so
+      // pretend we have an 8-byte item at the current address for that
+      // purpose.
+      if (!ObjSize) {
+        int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+        InVals.push_back(FIN);
+        continue;
+      }
+      // All aggregates smaller than 8 bytes must be passed right-justified.
+      if (ObjSize < PtrByteSize)
+        CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
+      // The value of the object is its address.
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(FIN);
+
+      if (ObjSize < 8) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store;
+
+          if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
+            EVT ObjType = (ObjSize == 1 ? MVT::i8 :
+                           (ObjSize == 2 ? MVT::i16 : MVT::i32));
+            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+                                      MachinePointerInfo(FuncArg, CurArgOffset),
+                                      ObjType, false, false, 0);
+          } else {
+            // For sizes that don't fit a truncating store (3, 5, 6, 7),
+            // store the whole register as-is to the parameter save area
+            // slot.  The address of the parameter was already calculated
+            // above (InVals.push_back(FIN)) to be the right-justified
+            // offset within the slot.  For this store, we need a new
+            // frame index that points at the beginning of the slot.
+            int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+            Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                 MachinePointerInfo(FuncArg, ArgOffset),
+                                 false, false, 0);
+          }
+
+          MemOps.push_back(Store);
+          ++GPR_idx;
+        }
+        // Whether we copied from a register or not, advance the offset
+        // into the parameter save area by a full doubleword.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+        // Store whatever pieces of the object are in registers
+        // to memory.  ArgOffset will be the address of the beginning
+        // of the object.
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                       MachinePointerInfo(FuncArg, ArgOffset),
+                                       false, false, 0);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ArgSize - j;
+          break;
+        }
+      }
+      continue;
+    }
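
To make the right-justification rule concrete: for an aggregate smaller
than a doubleword, the code above biases the fixed-object offset so the
callee sees the bytes at the high end of the slot. With assumed sizes:

    unsigned PtrByteSize = 8;
    unsigned ObjSize = 3;              // e.g. struct { char a, b, c; }
    unsigned ArgOffset = 48;           // assumed start of the save slot
    unsigned CurArgOffset = ArgOffset + (PtrByteSize - ObjSize);  // 53
    // The aggregate occupies offsets 53..55 of the doubleword at 48..55;
    // the full-register spill (the ObjSize 3/5/6/7 path) still writes all
    // eight bytes starting at 48.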
+
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32)
+          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+          // value to MVT::i64 and then truncate to the correct register size.
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+        ++GPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      ArgOffset += 8;
+      break;
+
+    case MVT::f32:
+    case MVT::f64:
+      // Every 8 bytes of argument space consumes one of the GPRs available for
+      // argument passing.
+      if (GPR_idx != Num_GPR_Regs) {
+        ++GPR_idx;
+      }
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+
+      ArgOffset += 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      // Note that vector arguments in registers don't reserve stack space,
+      // except in varargs functions.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        if (isVarArg) {
+          while ((ArgOffset % 16) != 0) {
+            ArgOffset += PtrByteSize;
+            if (GPR_idx != Num_GPR_Regs)
+              GPR_idx++;
+          }
+          ArgOffset += 16;
+          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+        }
+        ++VR_idx;
+      } else {
+        // Vectors are aligned.
+        ArgOffset = ((ArgOffset+15)/16)*16;
+        CurArgOffset = ArgOffset;
+        ArgOffset += 16;
+        needsLoad = true;
+      }
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined
+    // above that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      CurArgOffset + (ArgSize - ObjSize),
+                                      isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, false, 0);
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Set the size that is at least reserved in the caller of this function.  Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true);
+
+  // If the function takes a variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(PtrByteSize, Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by dereferencing the
+    // result of va_next.
+    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by PtrByteSize for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrByteSize, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl,
+                        MVT::Other, &MemOps[0], MemOps.size());
+
+  return Chain;
+}
+
 SDValue
 PPCTargetLowering::LowerFormalArguments_Darwin(
                                       SDValue Chain,
@@ -1987,10 +2381,12 @@
       default: llvm_unreachable("Unhandled argument type!");
       case MVT::i32:
       case MVT::f32:
-        VecArgOffset += isPPC64 ? 8 : 4;
+        VecArgOffset += 4;
         break;
       case MVT::i64:  // PPC64
       case MVT::f64:
+        // FIXME: We are guaranteed to be !isPPC64 at this point.
+        // Does MVT::i64 apply?
         VecArgOffset += 8;
         break;
       case MVT::v4f32:
@@ -2013,7 +2409,8 @@
 
   SmallVector<SDValue, 8> MemOps;
   unsigned nAltivecParamsAtEnd = 0;
-  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
     SDValue ArgVal;
     bool needsLoad = false;
     EVT ObjectVT = Ins[ArgNo].VT;
@@ -2061,10 +2458,11 @@
           else
             VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
           SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
-                                            MachinePointerInfo(),
-                                            ObjSize==1 ? MVT::i8 : MVT::i16,
-                                            false, false, 0);
+                                            MachinePointerInfo(FuncArg,
+                                              CurArgOffset),
+                                            ObjType, false, false, 0);
           MemOps.push_back(Store);
           ++GPR_idx;
         }
@@ -2075,8 +2473,8 @@
       }
       for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
         // Store whatever pieces of the object are in registers
-        // to memory.  ArgVal will be address of the beginning of
-        // the object.
+        // to memory.  ArgOffset will be the address of the beginning
+        // of the object.
         if (GPR_idx != Num_GPR_Regs) {
           unsigned VReg;
           if (isPPC64)
@@ -2087,7 +2485,7 @@
           SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
           SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                                       MachinePointerInfo(),
+                                       MachinePointerInfo(FuncArg, ArgOffset),
                                        false, false, 0);
           MemOps.push_back(Store);
           ++GPR_idx;
@@ -2122,18 +2520,10 @@
         unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
 
-        if (ObjectVT == MVT::i32) {
+        if (ObjectVT == MVT::i32)
           // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
           // value to MVT::i64 and then truncate to the correct register size.
-          if (Flags.isSExt())
-            ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
-                                 DAG.getValueType(ObjectVT));
-          else if (Flags.isZExt())
-            ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
-                                 DAG.getValueType(ObjectVT));
-
-          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
-        }
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
 
         ++GPR_idx;
       } else {
@@ -2220,23 +2610,10 @@
   }
 
   // Set the size that is at least reserved in the caller of this function.  Tail
-  // call optimized function's reserved stack space needs to be aligned so that
+  // call optimized functions' reserved stack space needs to be aligned so that
   // taking the difference between two stack areas will result in an aligned
   // stack.
-  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
-  // Add the Altivec parameters at the end, if needed.
-  if (nAltivecParamsAtEnd) {
-    MinReservedArea = ((MinReservedArea+15)/16)*16;
-    MinReservedArea += 16*nAltivecParamsAtEnd;
-  }
-  MinReservedArea =
-    std::max(MinReservedArea,
-             PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
-  unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()->
-    getStackAlignment();
-  unsigned AlignMask = TargetAlign-1;
-  MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
-  FI->setMinReservedArea(MinReservedArea);
+  setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64);
 
   // If the function takes a variable number of arguments, make a frame index for
   // the start of the first vararg value... for expansion of llvm.va_start.
@@ -2276,8 +2653,8 @@
   return Chain;
 }
 
-/// CalculateParameterAndLinkageAreaSize - Get the size of the paramter plus
-/// linkage area for the Darwin ABI.
+/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
+/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI.
 static unsigned
 CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
                                      bool isPPC64,
@@ -2408,7 +2785,7 @@
 
   int Addr = C->getZExtValue();
   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
-      (Addr << 6 >> 6) != Addr)
+      SignExtend32<26>(Addr) != Addr)
     return 0;  // Top 6 bits have to be sext of immediate.
 
   return DAG.getConstant((int)C->getZExtValue() >> 2,
@@ -2686,7 +3063,7 @@
       // Thus for a call through a function pointer, the following actions need
       // to be performed:
       //   1. Save the TOC of the caller in the TOC save area of its stack
-      //      frame (this is done in LowerCall_Darwin()).
+      //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
       //   2. Load the address of the function entry point from the function
       //      descriptor.
       //   3. Load the TOC of the callee from the function descriptor into r2.
@@ -2776,6 +3153,15 @@
   return CallOpc;
 }
 
+static bool isLocalCall(const SDValue &Callee) {
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    return !G->getGlobal()->isDeclaration() &&
+           !G->getGlobal()->isWeakForLinker();
+  return false;
+}
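
For context on isLocalCall: the NOP after a CALL_SVR4 exists so the linker
can patch in a TOC restore when the callee resolves to another module; a
callee defined locally that cannot be overridden never needs it. The
consumer is the hunk further below, in sketch form:

    // Mirrors the change below; not new logic:
    if (CallOpc == PPCISD::CALL_SVR4 && !isLocalCall(Callee))
      CallOpc = PPCISD::CALL_NOP_SVR4;  // keep the TOC-restore slot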
+
 SDValue
 PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                    CallingConv::ID CallConv, bool isVarArg,
@@ -2791,12 +3177,32 @@
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
     CCValAssign &VA = RVLocs[i];
-    EVT VT = VA.getValVT();
     assert(VA.isRegLoc() && "Can only return in registers!");
-    Chain = DAG.getCopyFromReg(Chain, dl,
-                               VA.getLocReg(), VT, InFlag).getValue(1);
-    InVals.push_back(Chain.getValue(0));
-    InFlag = Chain.getValue(2);
+
+    SDValue Val = DAG.getCopyFromReg(Chain, dl,
+                                     VA.getLocReg(), VA.getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
   }
 
   return Chain;
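
An example of the new return-value path, with assumed location info (an i1
result returned zero-extended in a 32-bit register):

    // Assumed: VA.getLocVT() == MVT::i32, VA.getValVT() == MVT::i1,
    // VA.getLocInfo() == CCValAssign::ZExt.
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                     MVT::i32, InFlag);
    // AssertZext records that only the low bit may be set; TRUNCATE then
    // narrows to the declared i1 result.
    Val = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Val,
                      DAG.getValueType(MVT::i1));
    Val = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Val);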
@@ -2819,6 +3225,10 @@
                                  isTailCall, RegsToPass, Ops, NodeTys,
                                  PPCSubTarget);
 
+  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+  if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
+    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
   // When performing tail call optimization the callee pops its arguments off
   // the stack. Account for this here so these bytes can be pushed back on in
   // PPCRegisterInfo::eliminateCallFramePseudoInstr.
@@ -2880,8 +3290,8 @@
       // from allocating it), resulting in an additional register being
       // allocated and an unnecessary move instruction being generated.
       needsTOCRestore = true;
-    } else if (CallOpc == PPCISD::CALL_SVR4) {
-      // Otherwise insert NOP.
+    } else if ((CallOpc == PPCISD::CALL_SVR4) && !isLocalCall(Callee)) {
+      // Otherwise insert NOP for non-local calls.
       CallOpc = PPCISD::CALL_NOP_SVR4;
     }
   }
@@ -2923,10 +3333,16 @@
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                    Ins, DAG);
 
-  if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
-    return LowerCall_SVR4(Chain, Callee, CallConv, isVarArg,
-                          isTailCall, Outs, OutVals, Ins,
-                          dl, DAG, InVals);
+  if (PPCSubTarget.isSVR4ABI()) {
+    if (PPCSubTarget.isPPC64())
+      return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, Outs, OutVals, Ins,
+                              dl, DAG, InVals);
+    else
+      return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, Outs, OutVals, Ins,
+                              dl, DAG, InVals);
+  }
 
   return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
                           isTailCall, Outs, OutVals, Ins,
@@ -2934,15 +3350,15 @@
 }
 
 SDValue
-PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
-                                  CallingConv::ID CallConv, bool isVarArg,
-                                  bool isTailCall,
-                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                  const SmallVectorImpl<SDValue> &OutVals,
-                                  const SmallVectorImpl<ISD::InputArg> &Ins,
-                                  DebugLoc dl, SelectionDAG &DAG,
-                                  SmallVectorImpl<SDValue> &InVals) const {
-  // See PPCTargetLowering::LowerFormalArguments_SVR4() for a description
+PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
   // of the 32-bit SVR4 ABI stack frame layout.
 
   assert((CallConv == CallingConv::C ||
@@ -3042,73 +3458,455 @@
 
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
-  SmallVector<SDValue, 8> MemOpChains;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  bool seenFloatArg = false;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, j = 0, e = ArgLocs.size();
+       i != e;
+       ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    if (Flags.isByVal()) {
+      // Argument is an aggregate which is passed by value, thus we need to
+      // create a copy of it in the local variable space of the current stack
+      // frame (which is the stack frame of the caller) and pass the address of
+      // this copy to the callee.
+      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
+      CCValAssign &ByValVA = ByValArgLocs[j++];
+      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+
+      // Memory reserved in the local variable space of the caller's stack frame.
+      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+
+      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+      // Create a copy of the argument in the local area of the current
+      // stack frame.
+      SDValue MemcpyCall =
+        CreateCopyOfByValArgument(Arg, PtrOff,
+                                  CallSeqStart.getNode()->getOperand(0),
+                                  Flags, DAG, dl);
+
+      // This must go outside the CALLSEQ_START..END.
+      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                           CallSeqStart.getNode()->getOperand(1));
+      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                             NewCallSeqStart.getNode());
+      Chain = CallSeqStart = NewCallSeqStart;
+
+      // Pass the address of the aggregate copy on the stack either in a
+      // physical register or in the parameter list area of the current stack
+      // frame to the callee.
+      Arg = PtrOff;
+    }
+
+    if (VA.isRegLoc()) {
+      seenFloatArg |= VA.getLocVT().isFloatingPoint();
+      // Put argument in a physical register.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Put argument in the parameter list area of the current stack frame.
+      assert(VA.isMemLoc());
+      unsigned LocMemOffset = VA.getLocMemOffset();
+
+      if (!isTailCall) {
+        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+      } else {
+        // Calculate and remember argument location.
+        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
+                                 TailCallArguments);
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Set CR bit 6 to true if this is a vararg call with floating args passed in
+  // registers.
+  if (isVarArg) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, InFlag };
+
+    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+                        dl, VTs, Ops, InFlag.getNode() ? 2 : 1);
+
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
+                    false, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
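
On the CR6SET/CR6UNSET sequence above: the 32-bit SVR4 ABI uses CR bit 6
to tell a vararg callee whether any floating-point arguments arrived in
registers, so it knows whether the FPR argument registers must be saved.
The node choice reduces to:

    // seenFloatArg is accumulated while walking the register assignments.
    unsigned Opc = seenFloatArg ? PPCISD::CR6SET    // FP arg in a register
                                : PPCISD::CR6UNSET; // none in registers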
+
+// Copy an argument into memory, being careful to do this outside the
+// call sequence for the call to which the argument belongs.
+SDValue
+PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                              SDValue CallSeqStart,
+                                              ISD::ArgFlagsTy Flags,
+                                              SelectionDAG &DAG,
+                                              DebugLoc dl) const {
+  SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+                        CallSeqStart.getNode()->getOperand(0),
+                        Flags, DAG, dl);
+  // The MEMCPY must go outside the CALLSEQ_START..END.
+  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                             CallSeqStart.getNode()->getOperand(1));
+  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                         NewCallSeqStart.getNode());
+  return NewCallSeqStart;
+}
+
+SDValue
+PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+
+  unsigned NumOps = Outs.size();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  unsigned PtrByteSize = 8;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamic
+  // stack allocation and for restoring the caller's stack pointer in this
+  // function's epilogue. This is done because the tail-called function might
+  // overwrite the value in this function's (MF) stack pointer stack slot
+  // 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  unsigned nAltivecParamsAtEnd = 0;
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with at least 48 bytes, which
+  // is reserved space for [SP][CR][LR][3 x unused].
+  // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result
+  // of this call.
+  unsigned NumBytes =
+    CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv,
+                                         Outs, OutVals, nAltivecParamsAtEnd);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so they can be moved somewhere
+  // else later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const uint16_t GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
 
-  bool seenFloatArg = false;
-  // Walk the register/memloc assignments, inserting copies/loads.
-  for (unsigned i = 0, j = 0, e = ArgLocs.size();
-       i != e;
-       ++i) {
-    CCValAssign &VA = ArgLocs[i];
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
     SDValue Arg = OutVals[i];
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
 
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // Promote integers to 64-bit values.
+    if (Arg.getValueType() == MVT::i32) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
     if (Flags.isByVal()) {
-      // Argument is an aggregate which is passed by value, thus we need to
-      // create a copy of it in the local variable space of the current stack
-      // frame (which is the stack frame of the caller) and pass the address of
-      // this copy to the callee.
-      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
-      CCValAssign &ByValVA = ByValArgLocs[j++];
-      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+      // Note: Size includes alignment padding, so
+      //   struct x { short a; char b; }
+      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
+      // These are the proper values we need for right-justifying the
+      // aggregate in a parameter register.
+      unsigned Size = Flags.getByValSize();
 
-      // Memory reserved in the local variable space of the callers stack frame.
-      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+      // An empty aggregate parameter takes up no storage and no
+      // registers.
+      if (Size == 0)
+        continue;
 
-      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
-      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+      // All aggregates smaller than 8 bytes must be passed right-justified.
+      if (Size==1 || Size==2 || Size==4) {
+        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                        MachinePointerInfo(), VT,
+                                        false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
 
-      // Create a copy of the argument in the local area of the current
-      // stack frame.
-      SDValue MemcpyCall =
-        CreateCopyOfByValArgument(Arg, PtrOff,
-                                  CallSeqStart.getNode()->getOperand(0),
-                                  Flags, DAG, dl);
+          ArgOffset += PtrByteSize;
+          continue;
+        }
+      }
 
-      // This must go outside the CALLSEQ_START..END.
-      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
-                           CallSeqStart.getNode()->getOperand(1));
-      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
-                             NewCallSeqStart.getNode());
-      Chain = CallSeqStart = NewCallSeqStart;
+      if (GPR_idx == NumGPRs && Size < 8) {
+        SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                        PtrOff.getValueType());
+        SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
 
-      // Pass the address of the aggregate copy on the stack either in a
-      // physical register or in the parameter list area of the current stack
-      // frame to the callee.
-      Arg = PtrOff;
+      // FIXME: The above statement is likely due to a misunderstanding of the
+      // documents.  All arguments must be copied into the parameter area BY
+      // THE CALLEE in the event that the callee takes the address of any
+      // formal argument.  That has not yet been implemented.  However, it is
+      // reasonable to use the stack area as a staging area for the register
+      // load.
+
+      // Skip this for small aggregates, as we will use the same slot for a
+      // right-justified copy, below.
+      if (Size >= 8)
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+      // When a register is available, pass a small aggregate right-justified.
+      if (Size < 8 && GPR_idx != NumGPRs) {
+        // The easiest way to get this right-justified in a register
+        // is to copy the structure into the rightmost portion of a
+        // local variable slot, then load the whole slot into the
+        // register.
+        // FIXME: The memcpy seems to produce pretty awful code for
+        // small aggregates, particularly for packed ones.
+        // FIXME: It would be preferable to use the slot in the 
+        // parameter save area instead of a new local variable.
+        SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
+        SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+        // Load the slot into the register.
+        SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+        MemOpChains.push_back(Load.getValue(1));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+        // Done with this argument.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      // For aggregates larger than PtrByteSize, copy the pieces of the
+      // object that fit into registers from the parameter save area.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
     }
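
A concrete instance of the right-justified copy above, with an assumed
size: for a 3-byte packed struct, the memcpy targets PtrOff + 5, and the
following 8-byte load picks the bytes up in the low-order end of the GPR
on this big-endian target:

    unsigned Size = 3;            // assumed byval size
    unsigned Pad = 8 - Size;      // 5 bytes of leading padding
    // memcpy(PtrOff + Pad, Arg, Size), then a doubleword load at PtrOff:
    // the data ends up right-justified in the register, matching what
    // LowerFormalArguments_64SVR4 expects on the callee side.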
 
-    if (VA.isRegLoc()) {
-      seenFloatArg |= VA.getLocVT().isFloatingPoint();
-      // Put argument in a physical register.
-      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-    } else {
-      // Put argument in the parameter list area of the current stack frame.
-      assert(VA.isMemLoc());
-      unsigned LocMemOffset = VA.getLocMemOffset();
+    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != NumGPRs) {
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (FPR_idx != NumFPRs) {
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
 
-      if (!isTailCall) {
-        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
-        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+        if (isVarArg) {
+          // A single float or an aggregate containing only a single float
+          // must be passed right-justified in the stack doubleword, and
+          // in the GPR, if one is available.
+          SDValue StoreOff;
+          if (Arg.getValueType().getSimpleVT().SimpleTy == MVT::f32) {
+            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+            StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+          } else
+            StoreOff = PtrOff;
 
-        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
-                                           MachinePointerInfo(),
-                                           false, false, 0));
+          SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
+                                       MachinePointerInfo(), false, false, 0);
+          MemOpChains.push_back(Store);
+
+          // Float varargs are always shadowed in available integer registers
+          if (GPR_idx != NumGPRs) {
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
+                                       MachinePointerInfo(), false, false,
+                                       false, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+        } else if (GPR_idx != NumGPRs)
+          // If we have any FPRs remaining, we may also have GPRs remaining.
+          ++GPR_idx;
       } else {
-        // Calculate and remember argument location.
-        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
-                                 TailCallArguments);
+        // Single-precision floating-point values are mapped to the
+        // second (rightmost) word of the stack doubleword.
+        if (Arg.getValueType() == MVT::f32) {
+          SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+        }
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      if (isVarArg) {
+        // These go aligned on the stack, or in the corresponding R registers
+        // when within range.  The Darwin PPC ABI doc claims they also go in
+        // V registers; in fact gcc does this only for arguments that are
+        // prototyped, not for those that match the ...  We do it for all
+        // arguments, seems to work.
+        while (ArgOffset % 16 != 0) {
+          ArgOffset += PtrByteSize;
+          if (GPR_idx != NumGPRs)
+            GPR_idx++;
+        }
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                            DAG.getConstant(ArgOffset, PtrVT));
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                  DAG.getConstant(i, PtrVT));
+          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs Altivec params generally go in registers, but have
+      // stack space allocated at the end.
+      if (VR_idx != NumVRs) {
+        // Doesn't have GPR space allocated.
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        ArgOffset += 16;
       }
+      break;
     }
   }
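
A standalone sketch of the bookkeeping in the vararg vector case above
(illustrative values assuming the 64-bit ELF layout with a 48-byte
linkage area and eight parameter GPRs; not part of the patch):

    void varargVectorSketch() {
      unsigned PtrByteSize = 8, NumGPRs = 8;
      unsigned ArgOffset = 48 + 8; // one i64 already laid down; it used r3
      unsigned GPR_idx = 1;
      while (ArgOffset % 16 != 0) { // pad the save area to 16 bytes
        ArgOffset += PtrByteSize;   // 56 -> 64
        if (GPR_idx != NumGPRs)
          ++GPR_idx;                // r4 shadows the padding doubleword
      }
      // The vector is stored at offset 64 and, space permitting, reloaded
      // into the next VR and into r5/r6, so the callee finds it whether it
      // looks in registers or in the parameter save area.
      for (unsigned i = 0; i < 16; i += PtrByteSize)
        if (GPR_idx != NumGPRs)
          ++GPR_idx;
      ArgOffset += 16;              // ArgOffset == 80, GPR_idx == 4
    }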
 
@@ -3116,12 +3914,24 @@
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Set CR6 to true if this is a vararg call with floating args passed in
-  // registers.
-  if (isVarArg) {
-    SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET,
-                                     dl, MVT::i32), 0);
-    RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));
+  // Check if this is an indirect call (MTCTR/BCTRL).
+  // See PrepareCall() for more information about calls through function
+  // pointers in the 64-bit SVR4 ABI.
+  if (!isTailCall &&
+      !dyn_cast<GlobalAddressSDNode>(Callee) &&
+      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+      !isBLACompatibleAddress(Callee, DAG)) {
+    // Load r2 into a virtual register and store it to the TOC save area.
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+    // TOC save area offset.
+    SDValue PtrOff = DAG.getIntPtrConstant(40);
+    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
+                         false, false, 0);
+    // R12 must contain the address of an indirect callee.  This does not
+    // mean the MTCTR instruction must use R12; it's easier to model this
+    // as an extra parameter, so do that.
+    RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
   }
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
@@ -3134,8 +3944,8 @@
   }
 
   if (isTailCall)
-    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
-                    false, TailCallArguments);
+    PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
 
   return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
                     RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
@@ -3152,7 +3962,7 @@
                                     DebugLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
 
-  unsigned NumOps  = Outs.size();
+  unsigned NumOps = Outs.size();
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   bool isPPC64 = PtrVT == MVT::i64;
@@ -3259,11 +4069,13 @@
     }
 
     // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
     if (Flags.isByVal()) {
       unsigned Size = Flags.getByValSize();
+      // Very small objects are passed right-justified.  Everything else is
+      // passed left-justified.
       if (Size==1 || Size==2) {
-        // Very small objects are passed right-justified.
-        // Everything else is passed left-justified.
         EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
         if (GPR_idx != NumGPRs) {
           SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
@@ -3274,17 +4086,12 @@
 
           ArgOffset += PtrByteSize;
         } else {
-          SDValue Const = DAG.getConstant(4 - Size, PtrOff.getValueType());
+          SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                          PtrOff.getValueType());
           SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
-          SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr,
-                                CallSeqStart.getNode()->getOperand(0),
-                                Flags, DAG, dl);
-          // This must go outside the CALLSEQ_START..END.
-          SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
-                               CallSeqStart.getNode()->getOperand(1));
-          DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
-                                 NewCallSeqStart.getNode());
-          Chain = CallSeqStart = NewCallSeqStart;
+          Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                            CallSeqStart,
+                                                            Flags, DAG, dl);
           ArgOffset += PtrByteSize;
         }
         continue;
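
To make the right-justification concrete (hypothetical example, not from
the patch): a 2-byte aggregate on a 64-bit target occupies the last two
bytes of its 8-byte slot when it must go to memory, which is exactly the
offset PtrByteSize - Size computes.

    struct S { short x; };  // Flags.getByValSize() == 2
    // no GPR free: store S at PtrOff + (8 - 2), i.e. bytes 6..7 of the slot
    // GPR free:    2-byte any-extending load (EXTLOAD) straight into the GPR
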
@@ -3292,15 +4099,13 @@
       // Copy entire object into memory.  There are cases where gcc-generated
       // code assumes it is there, even if it could be put entirely into
       // registers.  (This is not what the doc says.)
-      SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
-                            CallSeqStart.getNode()->getOperand(0),
-                            Flags, DAG, dl);
-      // This must go outside the CALLSEQ_START..END.
-      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
-                           CallSeqStart.getNode()->getOperand(1));
-      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode());
-      Chain = CallSeqStart = NewCallSeqStart;
-      // And copy the pieces of it that fit into registers.
+      Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                        CallSeqStart,
+                                                        Flags, DAG, dl);
+
+      // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
+      // copy the pieces of the object that fit into registers from the
+      // parameter save area.
       for (unsigned j=0; j<Size; j+=PtrByteSize) {
         SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
         SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
@@ -3369,11 +4174,10 @@
               !isPPC64)  // PPC64 has 64-bit GPRs, obviously :)
             ++GPR_idx;
         }
-      } else {
+      } else
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          isPPC64, isTailCall, false, MemOpChains,
                          TailCallArguments, dl);
-      }
       if (isPPC64)
         ArgOffset += 8;
       else
@@ -3468,22 +4272,6 @@
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Check if this is an indirect call (MTCTR/BCTRL).
-  // See PrepareCall() for more information about calls through function
-  // pointers in the 64-bit SVR4 ABI.
-  if (!isTailCall && isPPC64 && PPCSubTarget.isSVR4ABI() &&
-      !dyn_cast<GlobalAddressSDNode>(Callee) &&
-      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
-      !isBLACompatibleAddress(Callee, DAG)) {
-    // Load r2 into a virtual register and store it to the TOC save area.
-    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
-    // TOC save area offset.
-    SDValue PtrOff = DAG.getIntPtrConstant(40);
-    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
-    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
-                         false, false, 0);
-  }
-
   // On Darwin, R12 must contain the address of an indirect callee.  This does
   // not mean the MTCTR instruction must use R12; it's easier to model this as
   // an extra parameter, so do that.
@@ -3548,8 +4336,24 @@
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");
-    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                             OutVals[i], Flag);
+
+    SDValue Arg = OutVals[i];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
     Flag = Chain.getValue(1);
   }
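
A case that exercises the new SExt arm (hypothetical example, not taken
from the patch): the SVR4 ABIs return small integers widened in r3/x3,
so the switch above widens the value to the location VT before the
CopyToReg.

    signed char negOne() { return -1; } // CCValAssign::SExt ->
                                        // ISD::SIGN_EXTEND, then CopyToReg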
 
@@ -3781,7 +4585,52 @@
     return SDValue();
 
   if (Op.getOperand(0).getValueType() == MVT::i64) {
-    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op.getOperand(0));
+    SDValue SINT = Op.getOperand(0);
+    // When converting to single-precision, we actually need to convert
+    // to double-precision first and then round to single-precision.
+    // To avoid double-rounding effects during that operation, we have
+    // to prepare the input operand.  Bits that might be truncated when
+    // converting to double-precision are replaced by a bit that won't
+    // be lost at this stage, but is below the single-precision rounding
+    // position.
+    //
+    // However, if -enable-unsafe-fp-math is in effect, accept double
+    // rounding to avoid the extra overhead.
+    if (Op.getValueType() == MVT::f32 &&
+        !DAG.getTarget().Options.UnsafeFPMath) {
+
+      // Twiddle input to make sure the low 11 bits are zero.  (If this
+      // is the case, we are guaranteed the value will fit into the 53-bit
+      // mantissa of an IEEE double-precision value without rounding.)
+      // If any of those low 11 bits were not zero originally, make sure
+      // bit 12 (value 2048) is set instead, so that the final rounding
+      // to single-precision gets the correct result.
+      SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                                  SINT, DAG.getConstant(2047, MVT::i64));
+      Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                          Round, DAG.getConstant(2047, MVT::i64));
+      Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
+      Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                          Round, DAG.getConstant(-2048, MVT::i64));
+
+      // However, we cannot use that value unconditionally: if the magnitude
+      // of the input value is small, the bit-twiddling we did above might
+      // end up visibly changing the output.  Fortunately, in that case, we
+      // don't need to twiddle bits since the original input will convert
+      // exactly to double-precision floating-point already.  Therefore,
+      // construct a conditional to use the original value if the top 11
+      // bits are all sign-bit copies, and use the rounded value computed
+      // above otherwise.
+      SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
+                                 SINT, DAG.getConstant(53, MVT::i32));
+      Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                         Cond, DAG.getConstant(1, MVT::i64));
+      Cond = DAG.getSetCC(dl, MVT::i32,
+                          Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT);
+
+      SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
+    }
+    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
     SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Bits);
     if (Op.getValueType() == MVT::f32)
       FP = DAG.getNode(ISD::FP_ROUND, dl,
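
The guard above is easier to follow as a standalone sketch (assumptions:
host C++ with arithmetic right shift on int64_t; the helper name and the
test value are made up for illustration):

    #include <cstdint>
    #include <cstdio>

    static float i64ToF32(int64_t SInt) {
      // Top 11 bits all sign-bit copies? Then SInt converts to double
      // exactly and no guard is needed.
      if ((uint64_t)((SInt >> 53) + 1) <= 1)
        return (float)(double)SInt;
      int64_t Round = (SInt & 2047) + 2047; // 2048 bit set iff any low bit set
      Round |= SInt;
      Round &= ~2047LL; // low 11 bits cleared, sticky bit kept at value 2048
      return (float)(double)Round;
    }

    int main() {
      // Just above the f32 halfway point: naive i64->f64->f32 loses the
      // trailing 1 in the f64 step and then breaks the tie downward; the
      // sticky bit preserves the correct upward rounding.
      int64_t V = (1LL << 60) + (1LL << 36) + 1;
      std::printf("naive %a  guarded %a\n",
                  (double)(float)(double)V, (double)i64ToF32(V));
    }
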
@@ -4126,7 +4975,7 @@
     unsigned TypeShiftAmt = i & (SplatBitSize-1);
 
     // vsplti + shl self.
-    if (SextVal == (i << (int)TypeShiftAmt)) {
+    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
       SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
       static const unsigned IIDs[] = { // Intrinsic to use for each size.
         Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
@@ -4171,17 +5020,17 @@
     }
 
     // t = vsplti c, result = vsldoi t, t, 1
-    if (SextVal == ((i << 8) | (i < 0 ? 0xFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
       SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
       return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
     }
     // t = vsplti c, result = vsldoi t, t, 2
-    if (SextVal == ((i << 16) | (i < 0 ? 0xFFFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
       SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
       return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
     }
     // t = vsplti c, result = vsldoi t, t, 3
-    if (SextVal == ((i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
       SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
       return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
     }
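
A worked instance of the vsldoi trick (illustration only): the v8i16
splat of 0xFBFF has SextVal == -1025 == ((-5) << 8) | 0xFF, so it is
built as vspltish vT, -5 (bytes FF FB FF FB ...) followed by
vsldoi vD, vT, vT, 1, which rotates the bytes left by one to give
FB FF FB FF ..., i.e. 0xFBFF in every halfword, for the cost of two
cheap instructions instead of a constant-pool load.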
@@ -5630,6 +6479,14 @@
     case 'v':
     case 'y':
       return C_RegisterClass;
+    case 'Z':
+      // FIXME: While Z does indicate a memory constraint, it specifically
+      // indicates an r+r address (used in conjunction with the 'y' modifier
+      // in the replacement string). Currently, we're forcing the base
+      // register to be r0 in the asm printer (which is interpreted as zero)
+      // and forming the complete address in the second register. This is
+      // suboptimal.
+      return C_Memory;
     }
   }
   return TargetLowering::getConstraintType(Constraint);
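
For context, the usual consumer of this constraint (an example of
existing GCC-style inline asm, not code from this patch) pairs 'Z' with
the 'y' output modifier to feed the indexed load/store instructions:

    // Byte-reversed load via lwbrx; 'Z' supplies an r+r memory operand
    // and '%y' prints it in "base,index" form.
    static inline unsigned load_le32(const unsigned *p) {
      unsigned v;
      asm("lwbrx %0, %y1" : "=r"(v) : "Z"(*p));
      return v;
    }
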
@@ -5672,6 +6529,9 @@
   case 'y':
     weight = CW_Register;
     break;
+  case 'Z':
+    weight = CW_Memory;
+    break;
   }
   return weight;
 }
@@ -5688,9 +6548,9 @@
         return std::make_pair(0U, &PPC::G8RCRegClass);
       return std::make_pair(0U, &PPC::GPRCRegClass);
     case 'f':
-      if (VT == MVT::f32)
+      if (VT == MVT::f32 || VT == MVT::i32)
         return std::make_pair(0U, &PPC::F4RCRegClass);
-      if (VT == MVT::f64)
+      if (VT == MVT::f64 || VT == MVT::i64)
         return std::make_pair(0U, &PPC::F8RCRegClass);
       break;
     case 'v':
@@ -5870,7 +6730,8 @@
   bool is31 = (getTargetMachine().Options.DisableFramePointerElim(MF) ||
                MFI->hasVarSizedObjects()) &&
                   MFI->getStackSize() &&
-                  !MF.getFunction()->hasFnAttr(Attribute::Naked);
+                  !MF.getFunction()->getFnAttributes().
+                    hasAttribute(Attributes::Naked);
   unsigned FrameReg = isPPC64 ? (is31 ? PPC::X31 : PPC::X1) :
                                 (is31 ? PPC::R31 : PPC::R1);
   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
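
A call that reaches this path (hypothetical example): with a frame
pointer forced, e.g. by -fno-omit-frame-pointer or an alloca in the
frame, the copy should come from r31/x31, and from r1/x1 otherwise.

    void *frameHere() { return __builtin_frame_address(0); }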

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCISelLowering.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCISelLowering.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCISelLowering.h Tue Jan 15 11:16:16 2013
@@ -174,6 +174,10 @@
       ///   operand #3 optional in flag
       TC_RETURN,
 
+      /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+      CR6SET,
+      CR6UNSET,
+
       /// STD_32 - This is the STD instruction for use with "32-bit" registers.
       STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE,
 
@@ -463,20 +467,41 @@
                   DebugLoc dl, SelectionDAG &DAG) const;
 
     SDValue
+      extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
+                        SDValue ArgVal, DebugLoc dl) const;
+
+    void
+      setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
+                         unsigned nAltivecParamsAtEnd,
+                         unsigned MinReservedArea, bool isPPC64) const;
+
+    SDValue
       LowerFormalArguments_Darwin(SDValue Chain,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const;
     SDValue
-      LowerFormalArguments_SVR4(SDValue Chain,
-                                CallingConv::ID CallConv, bool isVarArg,
-                                const SmallVectorImpl<ISD::InputArg> &Ins,
-                                DebugLoc dl, SelectionDAG &DAG,
-                                SmallVectorImpl<SDValue> &InVals) const;
+      LowerFormalArguments_64SVR4(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+      LowerFormalArguments_32SVR4(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+      createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                 SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
+                                 SelectionDAG &DAG, DebugLoc dl) const;
 
     SDValue
-      LowerCall_Darwin(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+      LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                       CallingConv::ID CallConv,
                        bool isVarArg, bool isTailCall,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        const SmallVectorImpl<SDValue> &OutVals,
@@ -484,13 +509,22 @@
                        DebugLoc dl, SelectionDAG &DAG,
                        SmallVectorImpl<SDValue> &InVals) const;
     SDValue
-    LowerCall_SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
-                   bool isVarArg, bool isTailCall,
-                   const SmallVectorImpl<ISD::OutputArg> &Outs,
-                   const SmallVectorImpl<SDValue> &OutVals,
-                   const SmallVectorImpl<ISD::InputArg> &Ins,
-                   DebugLoc dl, SelectionDAG &DAG,
-                   SmallVectorImpl<SDValue> &InVals) const;
+      LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                       CallingConv::ID CallConv,
+                       bool isVarArg, bool isTailCall,
+                       const SmallVectorImpl<ISD::OutputArg> &Outs,
+                       const SmallVectorImpl<SDValue> &OutVals,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       DebugLoc dl, SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+    LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+                     bool isVarArg, bool isTailCall,
+                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     const SmallVectorImpl<SDValue> &OutVals,
+                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                     DebugLoc dl, SelectionDAG &DAG,
+                     SmallVectorImpl<SDValue> &InVals) const;
   };
 }
 

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstr64Bit.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstr64Bit.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstr64Bit.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstr64Bit.td Tue Jan 15 11:16:16 2013
@@ -29,6 +29,9 @@
   let PrintMethod = "printSymbolLo";
   let EncoderMethod = "getLO16Encoding";
 }
+def tocentry : Operand<iPTR> {
+  let MIOperandInfo = (ops i32imm:$imm);
+}
 
 //===----------------------------------------------------------------------===//
 // 64-bit transformation functions.
@@ -60,7 +63,7 @@
 //
 
 let Defs = [LR8] in
-  def MovePCtoLR8 : Pseudo<(outs), (ins), "", []>,
+  def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>,
                     PPC970_Unit_BRU;
 
 // Darwin ABI Calls.
@@ -138,31 +141,31 @@
 let usesCustomInserter = 1 in {
   let Defs = [CR0] in {
     def ATOMIC_LOAD_ADD_I64 : Pseudo<
-      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_ADD_I64",
       [(set G8RC:$dst, (atomic_load_add_64 xoaddr:$ptr, G8RC:$incr))]>;
     def ATOMIC_LOAD_SUB_I64 : Pseudo<
-      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_SUB_I64",
       [(set G8RC:$dst, (atomic_load_sub_64 xoaddr:$ptr, G8RC:$incr))]>;
     def ATOMIC_LOAD_OR_I64 : Pseudo<
-      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_OR_I64",
       [(set G8RC:$dst, (atomic_load_or_64 xoaddr:$ptr, G8RC:$incr))]>;
     def ATOMIC_LOAD_XOR_I64 : Pseudo<
-      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_XOR_I64",
       [(set G8RC:$dst, (atomic_load_xor_64 xoaddr:$ptr, G8RC:$incr))]>;
     def ATOMIC_LOAD_AND_I64 : Pseudo<
-      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_AND_I64",
       [(set G8RC:$dst, (atomic_load_and_64 xoaddr:$ptr, G8RC:$incr))]>;
     def ATOMIC_LOAD_NAND_I64 : Pseudo<
-      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_NAND_I64",
       [(set G8RC:$dst, (atomic_load_nand_64 xoaddr:$ptr, G8RC:$incr))]>;
 
     def ATOMIC_CMP_SWAP_I64 : Pseudo<
-      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$old, G8RC:$new), "",
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$old, G8RC:$new), "#ATOMIC_CMP_SWAP_I64",
       [(set G8RC:$dst, 
                     (atomic_cmp_swap_64 xoaddr:$ptr, G8RC:$old, G8RC:$new))]>;
 
     def ATOMIC_SWAP_I64 : Pseudo<
-      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$new), "",
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$new), "#ATOMIC_SWAP_I64",
       [(set G8RC:$dst, (atomic_swap_64 xoaddr:$ptr, G8RC:$new))]>;
   }
 }
@@ -231,10 +234,10 @@
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
   let Defs = [CTR8], Uses = [CTR8] in {
-    def BDZ8  : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
-                         "bdz $dst",  BrB, []>;
-    def BDNZ8 : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
-                         "bdnz $dst", BrB, []>;
+    def BDZ8  : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+                        "bdz $dst">;
+    def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+                        "bdnz $dst">;
   }
 }
 
@@ -244,7 +247,7 @@
             PPC970_MicroCode, PPC970_Unit_CRU;
 
 def MFCR8pseud: XFXForm_3<31, 19, (outs G8RC:$rT), (ins crbitm:$FXM),
-                       "", SprMFCR>,
+                       "#MFCR8pseud", SprMFCR>,
             PPC970_MicroCode, PPC970_Unit_CRU;
             
 def MFCR8 : XFXForm_3<31, 19, (outs G8RC:$rT), (ins),
@@ -275,7 +278,7 @@
 // the POWER3.
 
 let Defs = [X1], Uses = [X1] in
-def DYNALLOC8 : Pseudo<(outs G8RC:$result), (ins G8RC:$negsize, memri:$fpsi),"",
+def DYNALLOC8 : Pseudo<(outs G8RC:$result), (ins G8RC:$negsize, memri:$fpsi),"#DYNALLOC8",
                        [(set G8RC:$result,
                              (PPCdynalloc G8RC:$negsize, iaddr:$fpsi))]>;
 
@@ -296,12 +299,14 @@
 
 let PPC970_Unit = 1 in {  // FXU Operations.
 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
 def LI8  : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm),
                       "li $rD, $imm", IntSimple,
                       [(set G8RC:$rD, immSExt16:$imm)]>;
 def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm),
                       "lis $rD, $imm", IntSimple,
                       [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>;
+}
 
 // Logical ops.
 def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
@@ -459,7 +464,7 @@
 
 let Defs = [CARRY] in {
 def SRADI  : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH),
-                      "sradi $rA, $rS, $SH", IntRotateD,
+                      "sradi $rA, $rS, $SH", IntRotateDI,
                       [(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64;
 }
 def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS),
@@ -482,23 +487,23 @@
 let isCommutable = 1 in {
 def RLDIMI : MDForm_1<30, 3,
                       (outs G8RC:$rA), (ins G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB),
-                      "rldimi $rA, $rS, $SH, $MB", IntRotateD,
+                      "rldimi $rA, $rS, $SH, $MB", IntRotateDI,
                       []>, isPPC64, RegConstraint<"$rSi = $rA">,
                       NoEncode<"$rSi">;
 }
 
 // Rotate instructions.
 def RLDCL  : MDForm_1<30, 0,
-                      (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB, u6imm:$MB),
-                      "rldcl $rA, $rS, $rB, $MB", IntRotateD,
+                      (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB, u6imm:$MBE),
+                      "rldcl $rA, $rS, $rB, $MBE", IntRotateD,
                       []>, isPPC64;
 def RLDICL : MDForm_1<30, 0,
-                      (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MB),
-                      "rldicl $rA, $rS, $SH, $MB", IntRotateD,
+                      (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MBE),
+                      "rldicl $rA, $rS, $SH, $MBE", IntRotateDI,
                       []>, isPPC64;
 def RLDICR : MDForm_1<30, 1,
-                      (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$ME),
-                      "rldicr $rA, $rS, $SH, $ME", IntRotateD,
+                      (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MBE),
+                      "rldicr $rA, $rS, $SH, $MBE", IntRotateDI,
                       []>, isPPC64;
 
 def RLWINM8 : MForm_2<21,
@@ -506,7 +511,7 @@
                      "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
                      []>;
 
-def ISEL8   : AForm_1<31, 15,
+def ISEL8   : AForm_4<31, 15,
                      (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB, pred:$cond),
                      "isel $rT, $rA, $rB, $cond", IntGeneral,
                      []>;
@@ -541,19 +546,19 @@
 let mayLoad = 1 in
 def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp,
                             ptr_rc:$rA),
-                    "lhau $rD, $disp($rA)", LdStLoad,
+                    "lhau $rD, $disp($rA)", LdStLHAU,
                     []>, RegConstraint<"$rA = $ea_result">,
                     NoEncode<"$ea_result">;
 // NO LWAU!
 
 def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
                     (ins memrr:$addr),
-                    "lhaux $rD, $addr", LdStLoad,
+                    "lhaux $rD, $addr", LdStLHAU,
                     []>, RegConstraint<"$addr.offreg = $ea_result">,
                     NoEncode<"$ea_result">;
-def LWAUX : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
+def LWAUX : XForm_1<31, 373, (outs G8RC:$rD, ptr_rc:$ea_result),
                     (ins memrr:$addr),
-                    "lwaux $rD, $addr", LdStLoad,
+                    "lwaux $rD, $addr", LdStLHAU,
                     []>, RegConstraint<"$addr.offreg = $ea_result">,
                     NoEncode<"$ea_result">, isPPC64;
 }
@@ -584,31 +589,31 @@
 // Update forms.
 let mayLoad = 1 in {
 def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                    "lbzu $rD, $addr", LdStLoad,
+                    "lbzu $rD, $addr", LdStLoadUpd,
                     []>, RegConstraint<"$addr.reg = $ea_result">,
                     NoEncode<"$ea_result">;
 def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                    "lhzu $rD, $addr", LdStLoad,
+                    "lhzu $rD, $addr", LdStLoadUpd,
                     []>, RegConstraint<"$addr.reg = $ea_result">,
                     NoEncode<"$ea_result">;
 def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                    "lwzu $rD, $addr", LdStLoad,
+                    "lwzu $rD, $addr", LdStLoadUpd,
                     []>, RegConstraint<"$addr.reg = $ea_result">,
                     NoEncode<"$ea_result">;
 
 def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lbzux $rD, $addr", LdStLoad,
+                   "lbzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
-def LHZUX8 : XForm_1<31, 331, (outs G8RC:$rD, ptr_rc:$ea_result),
+def LHZUX8 : XForm_1<31, 311, (outs G8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lhzux $rD, $addr", LdStLoad,
+                   "lhzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lwzux $rD, $addr", LdStLoad,
+                   "lwzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 }
@@ -621,18 +626,26 @@
                     "ld $rD, $src", LdStLD,
                     [(set G8RC:$rD, (load ixaddr:$src))]>, isPPC64;
 def LDtoc: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
-                  "",
+                  "#LDtoc",
                   [(set G8RC:$rD,
                      (PPCtoc_entry tglobaladdr:$disp, G8RC:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+                  "#LDtocJTI",
+                  [(set G8RC:$rD,
+                     (PPCtoc_entry tjumptable:$disp, G8RC:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+                  "#LDtocCPT",
+                  [(set G8RC:$rD,
+                     (PPCtoc_entry tconstpool:$disp, G8RC:$reg))]>, isPPC64;
 
 let hasSideEffects = 1 in { 
-let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo.
-def LDinto_toc: DSForm_1<58, 0, (outs), (ins G8RC:$reg),
+let RST = 2, DS = 2 in
+def LDinto_toc: DSForm_1a<58, 0, (outs), (ins G8RC:$reg),
                     "ld 2, 8($reg)", LdStLD,
                     [(PPCload_toc G8RC:$reg)]>, isPPC64;
                     
-let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo.
-def LDtoc_restore : DSForm_1<58, 0, (outs), (ins),
+let RST = 2, DS = 10, RA = 1 in
+def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins),
                     "ld 2, 40(1)", LdStLD,
                     [(PPCtoc_restore)]>, isPPC64;
 }
@@ -642,13 +655,13 @@
                    
 let mayLoad = 1 in
 def LDU  : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr),
-                    "ldu $rD, $addr", LdStLD,
+                    "ldu $rD, $addr", LdStLDU,
                     []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
                     NoEncode<"$ea_result">;
 
 def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "ldux $rD, $addr", LdStLoad,
+                   "ldux $rD, $addr", LdStLDU,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">, isPPC64;
 }
@@ -693,16 +706,16 @@
 
 let PPC970_Unit = 2 in {
 
-def STBU8 : DForm_1a<38, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+def STBU8 : DForm_1a<39, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stbu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                           (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg, 
                                          iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "sthu $rS, $ptroff($ptrreg)", LdStStore,
+                    "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                         (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg, 
                                         iaddroff:$ptroff))]>,
@@ -710,7 +723,7 @@
 
 def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stwu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                           (pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg,
                                           iaddroff:$ptroff))]>,
@@ -718,7 +731,7 @@
 
 def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
                                         s16immX4:$ptroff, ptr_rc:$ptrreg),
-                    "stdu $rS, $ptroff($ptrreg)", LdStSTD,
+                    "stdu $rS, $ptroff($ptrreg)", LdStSTDU,
                     [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, 
                                                      iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
@@ -727,7 +740,7 @@
 
 def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res),
                               (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stbux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                        (pre_truncsti8 G8RC:$rS,
                                       ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -736,7 +749,7 @@
 
 def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res),
                               (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "sthux $rS, $ptroff, $ptrreg", LdStStore,
+                    "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                        (pre_truncsti16 G8RC:$rS,
                                        ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -745,7 +758,7 @@
 
 def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res),
                               (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stwux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                        (pre_truncsti32 G8RC:$rS,
                                        ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -754,7 +767,7 @@
 
 def STDUX : XForm_8<31, 181, (outs ptr_rc:$ea_res),
                               (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stdux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stdux $rS, $ptroff, $ptrreg", LdStSTDU,
                     [(set ptr_rc:$ea_res,
                        (pre_store G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
                     RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrAltivec.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrAltivec.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrAltivec.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrAltivec.td Tue Jan 15 11:16:16 2013
@@ -340,6 +340,28 @@
                       "vctuxs $vD, $vB, $UIMM", VecFP,
                       [(set VRRC:$vD,
                              (int_ppc_altivec_vctuxs VRRC:$vB, imm:$UIMM))]>;
+
+// Instruction definitions with the UIMM field set to 0, used by the
+// floating-point to integer (fp_to_sint/fp_to_uint) and integer to
+// floating-point (sint_to_fp/uint_to_fp) conversion patterns below.
+let VA = 0 in {
+def VCFSX_0 : VXForm_1<842, (outs VRRC:$vD), (ins VRRC:$vB),
+                       "vcfsx $vD, $vB, 0", VecFP,
+                       [(set VRRC:$vD,
+                             (int_ppc_altivec_vcfsx VRRC:$vB, 0))]>;
+def VCTUXS_0 : VXForm_1<906, (outs VRRC:$vD), (ins VRRC:$vB),
+                        "vctuxs $vD, $vB, 0", VecFP,
+                        [(set VRRC:$vD,
+                               (int_ppc_altivec_vctuxs VRRC:$vB, 0))]>;
+def VCFUX_0 : VXForm_1<778, (outs VRRC:$vD), (ins VRRC:$vB),
+                       "vcfux $vD, $vB, 0", VecFP,
+                       [(set VRRC:$vD,
+                               (int_ppc_altivec_vcfux VRRC:$vB, 0))]>;
+def VCTSXS_0 : VXForm_1<970, (outs VRRC:$vD), (ins VRRC:$vB),
+                      "vctsxs $vD, $vB, 0", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vctsxs VRRC:$vB, 0))]>;
+}
 def VEXPTEFP : VX2_Int<394, "vexptefp", int_ppc_altivec_vexptefp>;
 def VLOGEFP  : VX2_Int<458, "vlogefp",  int_ppc_altivec_vlogefp>;
 
@@ -689,3 +711,13 @@
           (v8i16 (VSRAH VRRC:$vA, VRRC:$vB))>;
 def : Pat<(v4i32 (sra (v4i32 VRRC:$vA), (v4i32 VRRC:$vB))),
           (v4i32 (VSRAW VRRC:$vA, VRRC:$vB))>;
+
+// Float to integer and integer to float conversions
+def : Pat<(v4i32 (fp_to_sint (v4f32 VRRC:$vA))),
+           (VCTSXS_0 VRRC:$vA)>;
+def : Pat<(v4i32 (fp_to_uint (v4f32 VRRC:$vA))),
+           (VCTUXS_0 VRRC:$vA)>;
+def : Pat<(v4f32 (sint_to_fp (v4i32 VRRC:$vA))),
+           (VCFSX_0 VRRC:$vA)>;
+def : Pat<(v4f32 (uint_to_fp (v4i32 VRRC:$vA))),
+           (VCFUX_0 VRRC:$vA)>;
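
A minimal sketch of source that should now select these patterns (they
match the generic fp_to_sint/fp_to_uint/sint_to_fp/uint_to_fp nodes; the
AltiVec intrinsics were matched already). __builtin_convertvector is a
clang extension, so its availability here is an assumption:

    typedef float vf32 __attribute__((vector_size(16)));
    typedef int   vi32 __attribute__((vector_size(16)));
    vi32 f2i(vf32 f) { return __builtin_convertvector(f, vi32); } // vctsxs vD, vB, 0
    vf32 i2f(vi32 v) { return __builtin_convertvector(v, vf32); } // vcfsx  vD, vB, 0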

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrFormats.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrFormats.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrFormats.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrFormats.td Tue Jan 15 11:16:16 2013
@@ -94,12 +94,6 @@
   let Inst{31}    = lk;
 }
 
-class IForm_ext<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
-            string asmstr, InstrItinClass itin, list<dag> pattern>
-         : IForm<opcode, aa, lk, OOL, IOL, asmstr, itin, pattern> {
-  let LI{0-4} = bo;
-}
-
 // 1.7.2 B-Form
 class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
   : I<opcode, OOL, IOL, asmstr, BrB> {
@@ -118,6 +112,13 @@
   let Inst{31}    = lk;
 }
 
+class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
+             string asmstr>
+  : BForm<opcode, aa, lk, OOL, IOL, asmstr> {
+  let BIBO{4-0} = bo;
+  let BIBO{6-5} = 0;
+  let CR = 0;
+}
 
 // 1.7.4 D-Form
 class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -625,9 +626,9 @@
                 InstrItinClass itin> 
   : I<opcode, OOL, IOL, asmstr, itin> {
   bits<8>  FXM;
-  bits<5>  ST;
+  bits<5>  rS;
    
-  let Inst{6-10}  = ST;
+  let Inst{6-10}  = rS;
   let Inst{11}    = 0;
   let Inst{12-19} = FXM;
   let Inst{20}    = 0;
@@ -666,7 +667,7 @@
                       string cstr, InstrItinClass itin, list<dag>pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
   bits<8> FM;
-  bits<5> RT;
+  bits<5> rT;
 
   bit RC = 0;    // set by isDOT
   let Pattern = pattern;
@@ -675,7 +676,7 @@
   let Inst{6} = 0;
   let Inst{7-14}  = FM;
   let Inst{15} = 0;
-  let Inst{16-20} = RT;
+  let Inst{16-20} = rT;
   let Inst{21-30} = xo;
   let Inst{31}    = RC;
 }
@@ -758,6 +759,26 @@
   let FRB = 0;
 }
 
+class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO (must be 12).
+  bits<3> CR;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-23} = CR;
+  let Inst{24-25} = BIBO{6-5};
+  let Inst{26-30} = xo;
+  let Inst{31}    = 0;
+}
+
 // 1.7.13 M-Form
 class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern>

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrInfo.cpp Tue Jan 15 11:16:16 2013
@@ -54,7 +54,8 @@
   const TargetMachine *TM,
   const ScheduleDAG *DAG) const {
   unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective();
-  if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) {
+  if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+      Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
     const InstrItineraryData *II = TM->getInstrItineraryData();
     return new PPCScoreboardHazardRecognizer(II, DAG);
   }
@@ -70,7 +71,8 @@
   unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
 
   // Most subtargets use a PPC970 recognizer.
-  if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2) {
+  if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+      Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
     const TargetInstrInfo *TII = TM.getInstrInfo();
     assert(TII && "No InstrInfo?");
 
@@ -568,12 +570,15 @@
     // STVX VAL, 0, R0
     //
     // FIXME: We use R0 here, because it isn't available for RA.
-    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::ADDI), PPC::R0),
+    bool Is64Bit = TM.getSubtargetImpl()->isPPC64();
+    unsigned Instr = Is64Bit ? PPC::ADDI8 : PPC::ADDI;
+    unsigned GPR0  = Is64Bit ? PPC::X0    : PPC::R0;
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Instr), GPR0),
                                        FrameIdx, 0, 0));
     NewMIs.push_back(BuildMI(MF, DL, get(PPC::STVX))
                      .addReg(SrcReg, getKillRegState(isKill))
-                     .addReg(PPC::R0)
-                     .addReg(PPC::R0));
+                     .addReg(GPR0)
+                     .addReg(GPR0));
   } else {
     llvm_unreachable("Unknown regclass!");
   }
@@ -705,10 +710,13 @@
     // Dest = LVX 0, R0
     //
     // FIXME: We use R0 here, because it isn't available for RA.
-    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::ADDI), PPC::R0),
+    bool Is64Bit = TM.getSubtargetImpl()->isPPC64();
+    unsigned Instr = Is64Bit ? PPC::ADDI8 : PPC::ADDI;
+    unsigned GPR0  = Is64Bit ? PPC::X0    : PPC::R0;
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Instr), GPR0),
                                        FrameIdx, 0, 0));
-    NewMIs.push_back(BuildMI(MF, DL, get(PPC::LVX),DestReg).addReg(PPC::R0)
-                     .addReg(PPC::R0));
+    NewMIs.push_back(BuildMI(MF, DL, get(PPC::LVX),DestReg).addReg(GPR0)
+                     .addReg(GPR0));
   } else {
     llvm_unreachable("Unknown regclass!");
   }

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrInfo.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrInfo.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCInstrInfo.td Tue Jan 15 11:16:16 2013
@@ -123,9 +123,11 @@
 def PPCload   : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
-                          [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+                          [SDNPHasChain, SDNPSideEffect,
+                           SDNPInGlue, SDNPOutGlue]>;
 def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>,
-                            [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+                            [SDNPHasChain, SDNPSideEffect,
+                             SDNPInGlue, SDNPOutGlue]>;
 def PPCmtctr      : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 def PPCbctrl_Darwin  : SDNode<"PPCISD::BCTRL_Darwin", SDTNone,
@@ -153,6 +155,12 @@
 def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
                            [SDNPHasChain, SDNPMayStore]>;
 
+// Instructions to set/unset CR bit 6 for SVR4 vararg calls
+def PPCcr6set   : SDNode<"PPCISD::CR6SET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
 // Instructions to support atomic operations
 def PPClarx      : SDNode<"PPCISD::LARX", SDT_PPClarx,
                           [SDNPHasChain, SDNPMayLoad]>;
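
Background for the CR6SET/CR6UNSET nodes above (this is the 32-bit SVR4
ABI rule, not something spelled out in the patch): before calling a
varargs function the caller sets CR bit 6 if any floating-point argument
is passed in registers and clears it otherwise, so the callee knows
whether it must save the FP argument registers. For example:

    #include <cstdio>
    int main() {
      return std::printf("%f\n", 3.14) < 0; // double vararg in f1 => CR6 set
    }
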
@@ -330,9 +338,6 @@
   let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
   let EncoderMethod = "getMemRIXEncoding";
 }
-def tocentry : Operand<iPTR> {
-  let MIOperandInfo = (ops i32imm:$imm);
-}
 
 // PowerPC Predicate operand.  20 = (0<<5)|20 = always, CR0 is a dummy reg
 // that doesn't matter.
@@ -364,9 +369,9 @@
 
 let hasCtrlDep = 1 in {
 let Defs = [R1], Uses = [R1] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "",
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
                               [(callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP   : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "",
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
                               [(callseq_end timm:$amt1, timm:$amt2)]>;
 }
 
@@ -375,7 +380,7 @@
 }
 
 let Defs = [R1], Uses = [R1] in
-def DYNALLOC : Pseudo<(outs GPRC:$result), (ins GPRC:$negsize, memri:$fpsi), "",
+def DYNALLOC : Pseudo<(outs GPRC:$result), (ins GPRC:$negsize, memri:$fpsi), "#DYNALLOC",
                        [(set GPRC:$result,
                              (PPCdynalloc GPRC:$negsize, iaddr:$fpsi))]>;
                          
@@ -384,19 +389,19 @@
 let usesCustomInserter = 1,    // Expanded after instruction selection.
     PPC970_Single = 1 in {
   def SELECT_CC_I4 : Pseudo<(outs GPRC:$dst), (ins CRRC:$cond, GPRC:$T, GPRC:$F,
-                              i32imm:$BROPC), "",
+                              i32imm:$BROPC), "#SELECT_CC_I4",
                               []>;
   def SELECT_CC_I8 : Pseudo<(outs G8RC:$dst), (ins CRRC:$cond, G8RC:$T, G8RC:$F,
-                              i32imm:$BROPC), "",
+                              i32imm:$BROPC), "#SELECT_CC_I8",
                               []>;
   def SELECT_CC_F4  : Pseudo<(outs F4RC:$dst), (ins CRRC:$cond, F4RC:$T, F4RC:$F,
-                              i32imm:$BROPC), "",
+                              i32imm:$BROPC), "#SELECT_CC_F4",
                               []>;
   def SELECT_CC_F8  : Pseudo<(outs F8RC:$dst), (ins CRRC:$cond, F8RC:$T, F8RC:$F,
-                              i32imm:$BROPC), "",
+                              i32imm:$BROPC), "#SELECT_CC_F8",
                               []>;
   def SELECT_CC_VRRC: Pseudo<(outs VRRC:$dst), (ins CRRC:$cond, VRRC:$T, VRRC:$F,
-                              i32imm:$BROPC), "",
+                              i32imm:$BROPC), "#SELECT_CC_VRRC",
                               []>;
 }
 
@@ -404,16 +409,16 @@
 // scavenge a register for it.
 let mayStore = 1 in
 def SPILL_CR : Pseudo<(outs), (ins CRRC:$cond, memri:$F),
-                     "", []>;
+                     "#SPILL_CR", []>;
 
 // RESTORE_CR - Indicate that we're restoring the CR register (previously
 // spilled), so we'll need to scavenge a register for it.
 let mayLoad = 1 in
 def RESTORE_CR : Pseudo<(outs CRRC:$cond), (ins memri:$F),
-                     "", []>;
+                     "#RESTORE_CR", []>;
 
 let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
-  let isReturn = 1, Uses = [LR, RM] in
+  let isCodeGenOnly = 1, isReturn = 1, Uses = [LR, RM] in
     def BLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$p),
                           "b${p:cc}lr ${p:reg}", BrB, 
                           [(retflag)]>;
@@ -422,7 +427,7 @@
 }
 
 let Defs = [LR] in
-  def MovePCtoLR : Pseudo<(outs), (ins), "", []>,
+  def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
                    PPC970_Unit_BRU;
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
@@ -434,16 +439,17 @@
 
   // BCC represents an arbitrary conditional branch on a predicate.
   // FIXME: should be able to write a pattern for PPCcondbranch, but can't use
-  // a two-value operand where a dag node expects two operands. :( 
-  def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
-                  "b${cond:cc} ${cond:reg}, $dst"
-                  /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
+  // a two-value operand where a dag node expects two operands. :(
+  let isCodeGenOnly = 1 in
+    def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
+                    "b${cond:cc} ${cond:reg}, $dst"
+                    /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
 
   let Defs = [CTR], Uses = [CTR] in {
-    def BDZ  : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
-                         "bdz $dst",  BrB, []>;
-    def BDNZ : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
-                         "bdnz $dst", BrB, []>;
+    def BDZ  : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz $dst">;
+    def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz $dst">;
   }
 }
 
@@ -559,81 +565,81 @@
 let usesCustomInserter = 1 in {
   let Defs = [CR0] in {
     def ATOMIC_LOAD_ADD_I8 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I8",
       [(set GPRC:$dst, (atomic_load_add_8 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_SUB_I8 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I8",
       [(set GPRC:$dst, (atomic_load_sub_8 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_AND_I8 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I8",
       [(set GPRC:$dst, (atomic_load_and_8 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_OR_I8 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I8",
       [(set GPRC:$dst, (atomic_load_or_8 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_XOR_I8 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_XOR_I8",
       [(set GPRC:$dst, (atomic_load_xor_8 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_NAND_I8 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I8",
       [(set GPRC:$dst, (atomic_load_nand_8 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_ADD_I16 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I16",
       [(set GPRC:$dst, (atomic_load_add_16 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_SUB_I16 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I16",
       [(set GPRC:$dst, (atomic_load_sub_16 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_AND_I16 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I16",
       [(set GPRC:$dst, (atomic_load_and_16 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_OR_I16 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I16",
       [(set GPRC:$dst, (atomic_load_or_16 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_XOR_I16 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_XOR_I16",
       [(set GPRC:$dst, (atomic_load_xor_16 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_NAND_I16 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I16",
       [(set GPRC:$dst, (atomic_load_nand_16 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_ADD_I32 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I32",
       [(set GPRC:$dst, (atomic_load_add_32 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_SUB_I32 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I32",
       [(set GPRC:$dst, (atomic_load_sub_32 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_AND_I32 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I32",
       [(set GPRC:$dst, (atomic_load_and_32 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_OR_I32 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I32",
       [(set GPRC:$dst, (atomic_load_or_32 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_XOR_I32 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_XOR_I32",
       [(set GPRC:$dst, (atomic_load_xor_32 xoaddr:$ptr, GPRC:$incr))]>;
     def ATOMIC_LOAD_NAND_I32 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I32",
       [(set GPRC:$dst, (atomic_load_nand_32 xoaddr:$ptr, GPRC:$incr))]>;
 
     def ATOMIC_CMP_SWAP_I8 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I8",
       [(set GPRC:$dst, 
                     (atomic_cmp_swap_8 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
     def ATOMIC_CMP_SWAP_I16 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I16",
       [(set GPRC:$dst, 
                     (atomic_cmp_swap_16 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
     def ATOMIC_CMP_SWAP_I32 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I32",
       [(set GPRC:$dst, 
                     (atomic_cmp_swap_32 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
 
     def ATOMIC_SWAP_I8 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_I8",
       [(set GPRC:$dst, (atomic_swap_8 xoaddr:$ptr, GPRC:$new))]>;
     def ATOMIC_SWAP_I16 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_I16",
       [(set GPRC:$dst, (atomic_swap_16 xoaddr:$ptr, GPRC:$new))]>;
     def ATOMIC_SWAP_I32 : Pseudo<
-      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "",
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_I32",
       [(set GPRC:$dst, (atomic_swap_32 xoaddr:$ptr, GPRC:$new))]>;
   }
 }
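
These pseudos are expanded into lwarx/stwcx. retry loops by the custom
inserter; giving each a "#NAME" asm string instead of "" makes them
legible in machine-instruction dumps. As a minimal, self-contained C++
sketch of the semantics ATOMIC_LOAD_ADD_I32 provides (the function name
is illustrative, not an LLVM API):

    #include <atomic>

    // Roughly what the expanded lwarx/stwcx. loop computes: apply the
    // operation, retry if the reservation is lost, and return the value
    // observed before the update (the pseudo's $dst).
    int atomic_load_add_32(std::atomic<int> &mem, int incr) {
      int old = mem.load(std::memory_order_relaxed);
      while (!mem.compare_exchange_weak(old, old + incr)) {
        // compare_exchange_weak refreshes 'old' on failure, mirroring
        // the branch back after a failed stwcx.
      }
      return old;
    }
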
@@ -673,7 +679,7 @@
                   [(set GPRC:$rD, (load iaddr:$src))]>;
 
 def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src),
-                  "lfs $rD, $src", LdStLFDU,
+                  "lfs $rD, $src", LdStLFD,
                   [(set F4RC:$rD, (load iaddr:$src))]>;
 def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
                   "lfd $rD, $src", LdStLFD,
@@ -683,32 +689,32 @@
 // Unindexed (r+i) Loads with Update (preinc).
 let mayLoad = 1 in {
 def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                   "lbzu $rD, $addr", LdStLoad,
+                   "lbzu $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                   "lhau $rD, $addr", LdStLoad,
+                   "lhau $rD, $addr", LdStLHAU,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                   "lhzu $rD, $addr", LdStLoad,
+                   "lhzu $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                   "lwzu $rD, $addr", LdStLoad,
+                   "lwzu $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LFSU : DForm_1<49, (outs F4RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                  "lfs $rD, $addr", LdStLFDU,
+                  "lfsu $rD, $addr", LdStLFDU,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
-                  "lfd $rD, $addr", LdStLFD,
+                  "lfdu $rD, $addr", LdStLFDU,
                   []>, RegConstraint<"$addr.reg = $ea_result">,
                    NoEncode<"$ea_result">;
 
@@ -716,37 +722,37 @@
 // Indexed (r+r) Loads with Update (preinc).
 def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lbzux $rD, $addr", LdStLoad,
+                   "lbzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lhaux $rD, $addr", LdStLoad,
+                   "lhaux $rD, $addr", LdStLHAU,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
-def LHZUX : XForm_1<31, 331, (outs GPRC:$rD, ptr_rc:$ea_result),
+def LHZUX : XForm_1<31, 311, (outs GPRC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lhzux $rD, $addr", LdStLoad,
+                   "lhzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lwzux $rD, $addr", LdStLoad,
+                   "lwzux $rD, $addr", LdStLoadUpd,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lfsux $rD, $addr", LdStLoad,
+                   "lfsux $rD, $addr", LdStLFDU,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 
 def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc:$ea_result),
                    (ins memrr:$addr),
-                   "lfdux $rD, $addr", LdStLoad,
+                   "lfdux $rD, $addr", LdStLFDU,
                    []>, RegConstraint<"$addr.offreg = $ea_result">,
                    NoEncode<"$ea_result">;
 }
@@ -778,10 +784,10 @@
                    [(set GPRC:$rD, (PPClbrx xoaddr:$src, i32))]>;
 
 def LFSX   : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src),
-                      "lfsx $frD, $src", LdStLFDU,
+                      "lfsx $frD, $src", LdStLFD,
                       [(set F4RC:$frD, (load xaddr:$src))]>;
 def LFDX   : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src),
-                      "lfdx $frD, $src", LdStLFDU,
+                      "lfdx $frD, $src", LdStLFD,
                       [(set F8RC:$frD, (load xaddr:$src))]>;
 }
 
@@ -801,10 +807,10 @@
                    "stw $rS, $src", LdStStore,
                    [(store GPRC:$rS, iaddr:$src)]>;
 def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst),
-                   "stfs $rS, $dst", LdStUX,
+                   "stfs $rS, $dst", LdStSTFD,
                    [(store F4RC:$rS, iaddr:$dst)]>;
 def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
-                   "stfd $rS, $dst", LdStUX,
+                   "stfd $rS, $dst", LdStSTFD,
                    [(store F8RC:$rS, iaddr:$dst)]>;
 }
 
@@ -812,33 +818,33 @@
 let PPC970_Unit = 2 in {
 def STBU  : DForm_1a<39, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stbu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                           (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg, 
                                          iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STHU  : DForm_1a<45, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "sthu $rS, $ptroff($ptrreg)", LdStStore,
+                    "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res,
                         (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg, 
                                         iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STWU  : DForm_1a<37, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stwu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd,
                     [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg, 
                                                      iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STFSU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F4RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stfsu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stfsu $rS, $ptroff($ptrreg)", LdStSTFDU,
                     [(set ptr_rc:$ea_res, (pre_store F4RC:$rS,  ptr_rc:$ptrreg, 
                                           iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
 def STFDU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F8RC:$rS,
                              symbolLo:$ptroff, ptr_rc:$ptrreg),
-                    "stfdu $rS, $ptroff($ptrreg)", LdStStore,
+                    "stfdu $rS, $ptroff($ptrreg)", LdStSTFDU,
                     [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg, 
                                           iaddroff:$ptroff))]>,
                     RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
@@ -863,7 +869,7 @@
  
 def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res),
                              (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                   "stbux $rS, $ptroff, $ptrreg", LdStStore,
+                   "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                    [(set ptr_rc:$ea_res,
                       (pre_truncsti8 GPRC:$rS,
                                      ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -872,7 +878,7 @@
  
 def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res),
                              (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                   "sthux $rS, $ptroff, $ptrreg", LdStStore,
+                   "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                    [(set ptr_rc:$ea_res,
                       (pre_truncsti16 GPRC:$rS,
                                       ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -881,7 +887,7 @@
                  
 def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res),
                              (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                   "stwux $rS, $ptroff, $ptrreg", LdStStore,
+                   "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd,
                    [(set ptr_rc:$ea_res,
                       (pre_store GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
                    RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -889,7 +895,7 @@
 
 def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res),
                               (ins F4RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stfsux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stfsux $rS, $ptroff, $ptrreg", LdStSTFDU,
                     [(set ptr_rc:$ea_res,
                        (pre_store F4RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
                     RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -897,7 +903,7 @@
 
 def STFDUX : XForm_8<31, 759, (outs ptr_rc:$ea_res),
                               (ins F8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
-                    "stfdux $rS, $ptroff, $ptrreg", LdStStore,
+                    "stfdux $rS, $ptroff, $ptrreg", LdStSTFDU,
                     [(set ptr_rc:$ea_res,
                        (pre_store F8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
                     RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -913,14 +919,14 @@
                    PPC970_DGroup_Cracked;
 
 def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst),
-                     "stfiwx $frS, $dst", LdStUX,
+                     "stfiwx $frS, $dst", LdStSTFD,
                      [(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>;
                      
 def STFSX : XForm_28<31, 663, (outs), (ins F4RC:$frS, memrr:$dst),
-                     "stfsx $frS, $dst", LdStUX,
+                     "stfsx $frS, $dst", LdStSTFD,
                      [(store F4RC:$frS, xaddr:$dst)]>;
 def STFDX : XForm_28<31, 727, (outs), (ins F8RC:$frS, memrr:$dst),
-                     "stfdx $frS, $dst", LdStUX,
+                     "stfdx $frS, $dst", LdStSTFD,
                      [(store F8RC:$frS, xaddr:$dst)]>;
 }
 
@@ -964,7 +970,7 @@
                      [(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>;
 }
 
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
   def LI  : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
                        "li $rD, $imm", IntSimple,
                        [(set GPRC:$rD, immSExt16:$imm)]>;
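
Marking li/lis isAsCheapAsAMove and isMoveImm tells the register
allocator that these immediates can be rematerialized at a use for the
cost of a move, so spilling and reloading such a value is never the
better choice.
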
@@ -1143,6 +1149,16 @@
               "crxor $dst, $dst, $dst", BrCR,
               []>;
 
+let Defs = [CR1EQ], CRD = 6 in {
+def CR6SET  : XLForm_1_ext<19, 289, (outs), (ins),
+              "creqv 6, 6, 6", BrCR,
+              [(PPCcr6set)]>;
+
+def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
+              "crxor 6, 6, 6", BrCR,
+              [(PPCcr6unset)]>;
+}
+
 // XFX-Form instructions.  Instructions that deal with SPRs.
 //
 let Uses = [CTR] in {
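
Background for CR6SET/CR6UNSET: the 32-bit SVR4 ABI requires the caller
of a variadic function to set CR bit 6 when floating-point arguments are
passed in FP registers and to clear it otherwise, and these definitions
give call lowering explicit instructions for that. A plain-C illustration
of the case that needs creqv 6, 6, 6 (nothing LLVM-specific):

    #include <cstdio>

    void demo(double x) {
      // A double passed to a varargs callee travels in an FP register,
      // so the caller must set CR bit 6 before the call.
      std::printf("%f\n", x);
    }
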
@@ -1192,7 +1208,7 @@
 //
 // FIXME: Make this a real Pseudo instruction when the JIT switches to MC.
 def MFCRpseud: XFXForm_3<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM),
-                       "", SprMFCR>,
+                       "#MFCRpseud", SprMFCR>,
             PPC970_MicroCode, PPC970_Unit_CRU;
             
 def MFCR : XFXForm_3<31, 19, (outs GPRC:$rT), (ins),
@@ -1233,7 +1249,7 @@
                PPC970_DGroup_Single, PPC970_Unit_FPU;
   def FADDrtz: AForm_2<63, 21,
                       (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
-                      "fadd $FRT, $FRA, $FRB", FPGeneral,
+                      "fadd $FRT, $FRA, $FRB", FPAddSub,
                       [(set F8RC:$FRT, (PPCfaddrtz F8RC:$FRA, F8RC:$FRB))]>,
                PPC970_DGroup_Single, PPC970_Unit_FPU;
 }
@@ -1364,7 +1380,7 @@
 let Uses = [RM] in {
   def FADD  : AForm_2<63, 21,
                       (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
-                      "fadd $FRT, $FRA, $FRB", FPGeneral,
+                      "fadd $FRT, $FRA, $FRB", FPAddSub,
                       [(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>;
   def FADDS : AForm_2<59, 21,
                       (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
@@ -1379,16 +1395,16 @@
                       "fdivs $FRT, $FRA, $FRB", FPDivS,
                       [(set F4RC:$FRT, (fdiv F4RC:$FRA, F4RC:$FRB))]>;
   def FMUL  : AForm_3<63, 25,
-                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
-                      "fmul $FRT, $FRA, $FRB", FPFused,
-                      [(set F8RC:$FRT, (fmul F8RC:$FRA, F8RC:$FRB))]>;
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC),
+                      "fmul $FRT, $FRA, $FRC", FPFused,
+                      [(set F8RC:$FRT, (fmul F8RC:$FRA, F8RC:$FRC))]>;
   def FMULS : AForm_3<59, 25,
-                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
-                      "fmuls $FRT, $FRA, $FRB", FPGeneral,
-                      [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>;
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC),
+                      "fmuls $FRT, $FRA, $FRC", FPGeneral,
+                      [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRC))]>;
   def FSUB  : AForm_2<63, 20,
                       (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
-                      "fsub $FRT, $FRA, $FRB", FPGeneral,
+                      "fsub $FRT, $FRA, $FRB", FPAddSub,
                       [(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>;
   def FSUBS : AForm_2<59, 20,
                       (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
@@ -1398,7 +1414,7 @@
 }
 
 let PPC970_Unit = 1 in {  // FXU Operations.
-  def ISEL  : AForm_1<31, 15,
+  def ISEL  : AForm_4<31, 15,
                      (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB, pred:$cond),
                      "isel $rT, $rA, $rB, $cond", IntGeneral,
                      []>;
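
Two encoding fixes above: fmul/fmuls read their second source from the
FRC field rather than FRB, so the operand names now match the A-form
layout, and isel moves to AForm_4, the variant that carries a
condition-register bit. Semantically isel is a register select; as a
hedged model (architectural behaviour only, not LLVM code):

    // isel $rT, $rA, $rB, $cond: rT = (CR bit set) ? rA : rB
    int isel(bool crBit, int rA, int rB) { return crBit ? rA : rB; }
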

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCRegisterInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCRegisterInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCRegisterInfo.cpp Tue Jan 15 11:16:16 2013
@@ -71,7 +71,7 @@
   : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
                        ST.isPPC64() ? 0 : 1,
                        ST.isPPC64() ? 0 : 1),
-    Subtarget(ST), TII(tii) {
+    Subtarget(ST), TII(tii), CRSpillFrameIdx(0) {
   ImmToIdxMap[PPC::LD]   = PPC::LDX;    ImmToIdxMap[PPC::STD]  = PPC::STDX;
   ImmToIdxMap[PPC::LBZ]  = PPC::LBZX;   ImmToIdxMap[PPC::STB]  = PPC::STBX;
   ImmToIdxMap[PPC::LHZ]  = PPC::LHZX;   ImmToIdxMap[PPC::LHA]  = PPC::LHAX;
@@ -111,10 +111,15 @@
     return Subtarget.isPPC64() ? CSR_Darwin64_SaveList :
                                  CSR_Darwin32_SaveList;
 
+  // For 32-bit SVR4, also initialize the frame index associated with
+  // the CR spill slot.
+  if (!Subtarget.isPPC64())
+    CRSpillFrameIdx = 0;
+
   return Subtarget.isPPC64() ? CSR_SVR464_SaveList : CSR_SVR432_SaveList;
 }
 
-const unsigned*
+const uint32_t*
 PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
   if (Subtarget.isDarwinABI())
     return Subtarget.isPPC64() ? CSR_Darwin64_RegMask :
@@ -477,6 +482,31 @@
   MBB.erase(II);
 }
 
+bool
+PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+				      unsigned Reg, int &FrameIdx) const {
+
+  // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
+  // ABI, return true to prevent allocating an additional frame slot.
+  // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
+  // is arbitrary and will be subsequently ignored.  For 32-bit, we must
+  // create exactly one stack slot and return its FrameIdx for all
+  // nonvolatiles.
+  if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) {
+    if (Subtarget.isPPC64()) {
+      FrameIdx = 0;
+    } else if (CRSpillFrameIdx) {
+      FrameIdx = CRSpillFrameIdx;
+    } else {
+      MachineFrameInfo *MFI = ((MachineFunction &)MF).getFrameInfo();
+      FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true);
+      CRSpillFrameIdx = FrameIdx;
+    }
+    return true;
+  }
+  return false;
+}
+
 void
 PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                      int SPAdj, RegScavenger *RS) const {
@@ -566,7 +596,7 @@
   // to Offset to get the correct offset.
   // Naked functions have stack size 0, although getStackSize may not reflect that
   // because we didn't call all the pieces that compute it for naked functions.
-  if (!MF.getFunction()->hasFnAttr(Attribute::Naked))
+  if (!MF.getFunction()->getFnAttributes().hasAttribute(Attributes::Naked))
     Offset += MFI->getStackSize();
 
   // If we can, encode the offset directly into the instruction.  If this is a
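
The new hasReservedSpillSlot hook lets prologue/epilogue insertion skip
allocating a fresh spill slot per condition register. A sketch of how a
caller might consult it, assuming PEI-style surrounding code (only
hasReservedSpillSlot itself is the interface added here; the rest is
illustrative):

    int FrameIdx = 0;
    if (TRI->hasReservedSpillSlot(MF, Reg, FrameIdx)) {
      // On 32-bit SVR4, CR2-CR4 all share one fixed 4-byte slot,
      // created once and cached in CRSpillFrameIdx.
      CSI.setFrameIdx(FrameIdx);
    } else {
      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
      CSI.setFrameIdx(MFI->CreateStackObject(RC->getSize(),
                                             RC->getAlignment(), true));
    }
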

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCRegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCRegisterInfo.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCRegisterInfo.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCRegisterInfo.h Tue Jan 15 11:16:16 2013
@@ -30,6 +30,7 @@
   std::map<unsigned, unsigned> ImmToIdxMap;
   const PPCSubtarget &Subtarget;
   const TargetInstrInfo &TII;
+  mutable int CRSpillFrameIdx;
 public:
   PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
   
@@ -43,7 +44,7 @@
 
   /// Code Generation virtual methods...
   const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
-  const unsigned *getCallPreservedMask(CallingConv::ID CC) const;
+  const uint32_t *getCallPreservedMask(CallingConv::ID CC) const;
 
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
@@ -65,6 +66,8 @@
                        int SPAdj, RegScavenger *RS) const;
   void lowerCRRestore(MachineBasicBlock::iterator II, unsigned FrameIndex,
                        int SPAdj, RegScavenger *RS) const;
+  bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+			    int &FrameIdx) const;
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, RegScavenger *RS = NULL) const;
 

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSchedule.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSchedule.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSchedule.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSchedule.td Tue Jan 15 11:16:16 2013
@@ -40,6 +40,7 @@
 def IntMulLI     : InstrItinClass;
 def IntRFID      : InstrItinClass;
 def IntRotateD   : InstrItinClass;
+def IntRotateDI  : InstrItinClass;
 def IntRotate    : InstrItinClass;
 def IntShift     : InstrItinClass;
 def IntTrapD     : InstrItinClass;
@@ -52,15 +53,18 @@
 def LdStDCBF     : InstrItinClass;
 def LdStDCBI     : InstrItinClass;
 def LdStLoad     : InstrItinClass;
+def LdStLoadUpd  : InstrItinClass;
 def LdStStore    : InstrItinClass;
+def LdStStoreUpd : InstrItinClass;
 def LdStDSS      : InstrItinClass;
 def LdStICBI     : InstrItinClass;
-def LdStUX       : InstrItinClass;
 def LdStLD       : InstrItinClass;
+def LdStLDU      : InstrItinClass;
 def LdStLDARX    : InstrItinClass;
 def LdStLFD      : InstrItinClass;
 def LdStLFDU     : InstrItinClass;
 def LdStLHA      : InstrItinClass;
+def LdStLHAU     : InstrItinClass;
 def LdStLMW      : InstrItinClass;
 def LdStLVecX    : InstrItinClass;
 def LdStLWA      : InstrItinClass;
@@ -69,6 +73,9 @@
 def LdStSLBIE    : InstrItinClass;
 def LdStSTD      : InstrItinClass;
 def LdStSTDCX    : InstrItinClass;
+def LdStSTDU     : InstrItinClass;
+def LdStSTFD     : InstrItinClass;
+def LdStSTFDU    : InstrItinClass;
 def LdStSTVEBX   : InstrItinClass;
 def LdStSTWCX    : InstrItinClass;
 def LdStSync     : InstrItinClass;
@@ -86,6 +93,7 @@
 def SprRFI       : InstrItinClass;
 def SprSC        : InstrItinClass;
 def FPGeneral    : InstrItinClass;
+def FPAddSub     : InstrItinClass;
 def FPCompare    : InstrItinClass;
 def FPDivD       : InstrItinClass;
 def FPDivS       : InstrItinClass;
@@ -110,6 +118,8 @@
 include "PPCScheduleG4Plus.td"
 include "PPCScheduleG5.td"
 include "PPCScheduleA2.td"
+include "PPCScheduleE500mc.td"
+include "PPCScheduleE5500.td"
 
 //===----------------------------------------------------------------------===//
 // Instruction to itinerary class map - When add new opcodes to the supported
@@ -171,7 +181,7 @@
 //    extsh      IntSimple
 //    extsw      IntSimple
 //    fabs       FPGeneral
-//    fadd       FPGeneral
+//    fadd       FPAddSub
 //    fadds      FPGeneral
 //    fcfid      FPGeneral
 //    fcmpo      FPCompare
@@ -201,35 +211,35 @@
 //    fsel       FPGeneral
 //    fsqrt      FPSqrt
 //    fsqrts     FPSqrt
-//    fsub       FPGeneral
+//    fsub       FPAddSub
 //    fsubs      FPGeneral
 //    icbi       LdStICBI
 //    isync      SprISYNC
 //    lbz        LdStLoad
-//    lbzu       LdStLoad
-//    lbzux      LdStUX
+//    lbzu       LdStLoadUpd
+//    lbzux      LdStLoadUpd
 //    lbzx       LdStLoad
 //    ld         LdStLD
 //    ldarx      LdStLDARX
-//    ldu        LdStLD
-//    ldux       LdStLD
+//    ldu        LdStLDU
+//    ldux       LdStLDU
 //    ldx        LdStLD
 //    lfd        LdStLFD
 //    lfdu       LdStLFDU
 //    lfdux      LdStLFDU
-//    lfdx       LdStLFDU
-//    lfs        LdStLFDU
+//    lfdx       LdStLFD
+//    lfs        LdStLFD
 //    lfsu       LdStLFDU
 //    lfsux      LdStLFDU
-//    lfsx       LdStLFDU
+//    lfsx       LdStLFD
 //    lha        LdStLHA
-//    lhau       LdStLHA
-//    lhaux      LdStLHA
+//    lhau       LdStLHAU
+//    lhaux      LdStLHAU
 //    lhax       LdStLHA
 //    lhbrx      LdStLoad
 //    lhz        LdStLoad
-//    lhzu       LdStLoad
-//    lhzux      LdStUX
+//    lhzu       LdStLoadUpd
+//    lhzux      LdStLoadUpd
 //    lhzx       LdStLoad
 //    lmw        LdStLMW
 //    lswi       LdStLMW
@@ -243,12 +253,12 @@
 //    lvxl       LdStLVecX
 //    lwa        LdStLWA
 //    lwarx      LdStLWARX
-//    lwaux      LdStLHA
+//    lwaux      LdStLHAU
 //    lwax       LdStLHA
 //    lwbrx      LdStLoad
 //    lwz        LdStLoad
-//    lwzu       LdStLoad
-//    lwzux      LdStUX
+//    lwzu       LdStLoadUpd
+//    lwzux      LdStLoadUpd
 //    lwzx       LdStLoad
 //    mcrf       BrMCR
 //    mcrfs      FPGeneral
@@ -292,10 +302,10 @@
 //    rfid       IntRFID
 //    rldcl      IntRotateD
 //    rldcr      IntRotateD
-//    rldic      IntRotateD
-//    rldicl     IntRotateD
-//    rldicr     IntRotateD
-//    rldimi     IntRotateD
+//    rldic      IntRotateDI
+//    rldicl     IntRotateDI
+//    rldicr     IntRotateDI
+//    rldimi     IntRotateDI
 //    rlwimi     IntRotate
 //    rlwinm     IntGeneral
 //    rlwnm      IntGeneral
@@ -305,33 +315,33 @@
 //    sld        IntRotateD
 //    slw        IntGeneral
 //    srad       IntRotateD
-//    sradi      IntRotateD
+//    sradi      IntRotateDI
 //    sraw       IntShift
 //    srawi      IntShift
 //    srd        IntRotateD
 //    srw        IntGeneral
 //    stb        LdStStore
-//    stbu       LdStStore
-//    stbux      LdStStore
+//    stbu       LdStStoreUpd
+//    stbux      LdStStoreUpd
 //    stbx       LdStStore
 //    std        LdStSTD
 //    stdcx.     LdStSTDCX
-//    stdu       LdStSTD
-//    stdux      LdStSTD
+//    stdu       LdStSTDU
+//    stdux      LdStSTDU
 //    stdx       LdStSTD
-//    stfd       LdStUX
-//    stfdu      LdStUX
-//    stfdux     LdStUX
-//    stfdx      LdStUX
-//    stfiwx     LdStUX
-//    stfs       LdStUX
-//    stfsu      LdStUX
-//    stfsux     LdStUX
-//    stfsx      LdStUX
+//    stfd       LdStSTFD
+//    stfdu      LdStSTFDU
+//    stfdux     LdStSTFDU
+//    stfdx      LdStSTFD
+//    stfiwx     LdStSTFD
+//    stfs       LdStSTFD
+//    stfsu      LdStSTFDU
+//    stfsux     LdStSTFDU
+//    stfsx      LdStSTFD
 //    sth        LdStStore
 //    sthbrx     LdStStore
-//    sthu       LdStStore
-//    sthux      LdStStore
+//    sthu       LdStStoreUpd
+//    sthux      LdStStoreUpd
 //    sthx       LdStStore
 //    stmw       LdStLMW
 //    stswi      LdStLMW
@@ -344,8 +354,8 @@
 //    stw        LdStStore
 //    stwbrx     LdStStore
 //    stwcx.     LdStSTWCX
-//    stwu       LdStStore
-//    stwux      LdStStore
+//    stwu       LdStStoreUpd
+//    stwux      LdStStoreUpd
 //    stwx       LdStStore
 //    subf       IntGeneral
 //    subfc      IntGeneral
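
The new itinerary classes split instructions that shared a class but
time differently on in-order cores: the update-form loads and stores
(LdStLoadUpd, LdStStoreUpd, LdStLHAU, LdStLDU, LdStSTDU) also write the
effective address back to the base register, and the FP stores move out
of the catch-all LdStUX into LdStSTFD/LdStSTFDU. Because every processor
model must cover every class it can see, the 440, A2, G3, G4, G4+ and G5
tables below gain matching rows, while the new e500mc/e5500 models can
give them distinct latencies.
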

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSchedule440.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSchedule440.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSchedule440.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSchedule440.td Tue Jan 15 11:16:16 2013
@@ -288,6 +288,15 @@
                                InstrStage<2, [LWB]>],
                               [9, 5],
                               [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [9, 5],
+                              [GPR_Bypass, GPR_Bypass]>,                              
   InstrItinData<LdStStore   , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
@@ -297,6 +306,15 @@
                                InstrStage<2, [LWB]>],
                               [8, 5],
                               [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
   InstrItinData<LdStICBI    , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
@@ -306,7 +324,7 @@
                                InstrStage<1, [LWB]>],
                               [8, 5],
                               [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStUX      , [InstrStage<1, [IFTH1, IFTH2]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
                                InstrStage<1, [LRACC]>,
@@ -315,6 +333,15 @@
                                InstrStage<1, [LWB]>],
                               [8, 5, 5],
                               [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5, 5],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,                              
   InstrItinData<LdStLFD     , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
@@ -342,6 +369,15 @@
                                InstrStage<1, [LWB]>],
                               [8, 5],
                               [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
   InstrItinData<LdStLMW     , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,
@@ -371,6 +407,15 @@
                                InstrStage<2, [LWB]>],
                               [8, 5],
                               [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTDU    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
   InstrItinData<LdStSTDCX   , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1]>,
@@ -537,6 +582,19 @@
                                InstrStage<1, [FWB]>],
                               [10, 4, 4],
                               [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<1, [FWB]>],
+                              [10, 4, 4],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
   InstrItinData<FPCompare   , [InstrStage<1, [IFTH1, IFTH2]>,
                                InstrStage<1, [PDCD1, PDCD2]>,
                                InstrStage<1, [DISS1, DISS2]>,

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleA2.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleA2.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleA2.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleA2.td Tue Jan 15 11:16:16 2013
@@ -181,6 +181,17 @@
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [10, 7, 7],
                               [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotateDI , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
   InstrItinData<IntShift    , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -302,7 +313,18 @@
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [14, 7],
                               [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStLD      , [InstrStage<4,
+  InstrItinData<LdStLoadUpd , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLDU     , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
                                               IU4_4, IU4_5, IU4_6, IU4_7]>,
@@ -324,6 +346,17 @@
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [13, 7],
                               [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 7],
+                              [GPR_Bypass, GPR_Bypass]>,
   InstrItinData<LdStICBI    , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -335,7 +368,7 @@
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [14, 7],
                               [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStUX      , [InstrStage<4,
+  InstrItinData<LdStSTFD    , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
                                               IU4_4, IU4_5, IU4_6, IU4_7]>,
@@ -346,6 +379,17 @@
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [14, 7, 7],
                               [NoBypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7, 7],
+                              [NoBypass, FPR_Bypass, FPR_Bypass]>,                              
   InstrItinData<LdStLFD     , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -379,6 +423,17 @@
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [14, 7],
                               [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [NoBypass, GPR_Bypass]>,
   InstrItinData<LdStLMW     , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -412,6 +467,17 @@
                                InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
                               [13, 7],
                               [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTDU    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 7],
+                              [GPR_Bypass, GPR_Bypass]>,                              
   InstrItinData<LdStSTDCX   , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -593,6 +659,17 @@
                                InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
                               [15, 7, 7],
                               [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+                              [15, 7, 7],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
   InstrItinData<FPCompare   , [InstrStage<4,
                                  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
                                InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG3.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG3.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG3.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG3.td Tue Jan 15 11:16:16 2013
@@ -34,12 +34,16 @@
   InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLoad    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,  
   InstrItinData<LdStStore   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,  
   InstrItinData<LdStICBI    , [InstrStage<3, [SLU]>]>,
-  InstrItinData<LdStUX      , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<2, [SLU]>]>,  
   InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
   InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTWCX   , [InstrStage<8, [SLU]>]>,
@@ -58,6 +62,7 @@
   InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
   InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
   InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [FPU1]>]>,
   InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
   InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
   InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG4.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG4.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG4.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG4.td Tue Jan 15 11:16:16 2013
@@ -33,13 +33,17 @@
   InstrItinData<LdStDCBF    , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStDCBI    , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLoad    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStStore   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStDSS     , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStICBI    , [InstrStage<2, [SLU]>]>,
-  InstrItinData<LdStUX      , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<2, [SLU]>]>, 
   InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
   InstrItinData<LdStLVecX   , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
@@ -60,6 +64,7 @@
   InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
   InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
   InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [FPU1]>]>,
   InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
   InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
   InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG4Plus.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG4Plus.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG4Plus.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG4Plus.td Tue Jan 15 11:16:16 2013
@@ -36,19 +36,24 @@
   InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLoad    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStStore   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStDSS     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStICBI    , [InstrStage<3, [IU2]>]>,
-  InstrItinData<LdStUX      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLFD     , [InstrStage<4, [SLU]>]>,
   InstrItinData<LdStLFDU    , [InstrStage<4, [SLU]>]>,
   InstrItinData<LdStLHA     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<3, [SLU]>]>,  
   InstrItinData<LdStLMW     , [InstrStage<37, [SLU]>]>,
   InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLWA     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTDCX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDU    , [InstrStage<3, [SLU]>]>,  
   InstrItinData<LdStSTVEBX  , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTWCX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSync    , [InstrStage<35, [SLU]>]>,
@@ -66,6 +71,7 @@
   InstrItinData<SprRFI      , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
   InstrItinData<SprSC       , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
   InstrItinData<FPGeneral   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<5, [FPU1]>]>,  
   InstrItinData<FPCompare   , [InstrStage<5, [FPU1]>]>,
   InstrItinData<FPDivD      , [InstrStage<35, [FPU1]>]>,
   InstrItinData<FPDivS      , [InstrStage<21, [FPU1]>]>,

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG5.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG5.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG5.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCScheduleG5.td Tue Jan 15 11:16:16 2013
@@ -27,6 +27,7 @@
   InstrItinData<IntMulLI    , [InstrStage<4, [IU1, IU2]>]>,
   InstrItinData<IntRFID     , [InstrStage<1, [IU2]>]>,
   InstrItinData<IntRotateD  , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntRotateDI , [InstrStage<2, [IU1, IU2]>]>,  
   InstrItinData<IntRotate   , [InstrStage<4, [IU1, IU2]>]>,
   InstrItinData<IntShift    , [InstrStage<2, [IU1, IU2]>]>,
   InstrItinData<IntTrapD    , [InstrStage<1, [IU1, IU2]>]>,
@@ -37,15 +38,20 @@
   InstrItinData<BrMCRX      , [InstrStage<3, [BPU]>]>,
   InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLoad    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,  
   InstrItinData<LdStStore   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,  
   InstrItinData<LdStDSS     , [InstrStage<10, [SLU]>]>,
   InstrItinData<LdStICBI    , [InstrStage<40, [SLU]>]>,
-  InstrItinData<LdStUX      , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<4, [SLU]>]>,  
   InstrItinData<LdStLD      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLDU     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLDARX   , [InstrStage<11, [SLU]>]>,
   InstrItinData<LdStLFD     , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLFDU    , [InstrStage<5, [SLU]>]>,
   InstrItinData<LdStLHA     , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<5, [SLU]>]>,  
   InstrItinData<LdStLMW     , [InstrStage<64, [SLU]>]>,
   InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStLWA     , [InstrStage<5, [SLU]>]>,
@@ -53,6 +59,7 @@
   InstrItinData<LdStSLBIA   , [InstrStage<40, [SLU]>]>, // needs work
   InstrItinData<LdStSLBIE   , [InstrStage<2, [SLU]>]>,
   InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDU    , [InstrStage<3, [SLU]>]>,
   InstrItinData<LdStSTDCX   , [InstrStage<11, [SLU]>]>,
   InstrItinData<LdStSTVEBX  , [InstrStage<5, [SLU]>]>,
   InstrItinData<LdStSTWCX   , [InstrStage<11, [SLU]>]>,
@@ -69,6 +76,7 @@
   InstrItinData<SprMTSPR    , [InstrStage<8, [IU2]>]>,
   InstrItinData<SprSC       , [InstrStage<1, [IU2]>]>,
   InstrItinData<FPGeneral   , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<6, [FPU1, FPU2]>]>,
   InstrItinData<FPCompare   , [InstrStage<8, [FPU1, FPU2]>]>,
   InstrItinData<FPDivD      , [InstrStage<33, [FPU1, FPU2]>]>,
   InstrItinData<FPDivS      , [InstrStage<33, [FPU1, FPU2]>]>,

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSubtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSubtarget.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSubtarget.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSubtarget.cpp Tue Jan 15 11:16:16 2013
@@ -54,19 +54,26 @@
     CPUName = sys::getHostCPUName();
 #endif
 
-  // Parse features string.
-  ParseSubtargetFeatures(CPUName, FS);
-
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUName);
 
+  // Make sure 64-bit features are available when the CPU name is generic.
+  std::string FullFS = FS;
+
   // If we are generating code for ppc64, verify that options make sense.
   if (is64Bit) {
     Has64BitSupport = true;
     // Silently force 64-bit register use on ppc64.
     Use64BitRegs = true;
+    if (!FullFS.empty())
+      FullFS = "+64bit," + FullFS;
+    else
+      FullFS = "+64bit";
   }
-  
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FullFS);
+
   // If the user requested use of 64-bit regs, but the cpu selected doesn't
   // support it, ignore.
   if (use64BitRegs() && !has64BitSupport())
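
The reordering is the point of this hunk: ParseSubtargetFeatures now
runs after "+64bit" is prepended, so the 64-bit feature bit is set even
when the CPU name is "generic". A self-contained sketch of the string
construction (addFeature is an illustrative name, not LLVM API):

    #include <cassert>
    #include <string>

    // "+64bit" goes first; features are applied in order, so anything
    // the user passed explicitly still takes effect afterwards.
    std::string addFeature(const std::string &FS) {
      return FS.empty() ? std::string("+64bit") : "+64bit," + FS;
    }

    int main() {
      assert(addFeature("") == "+64bit");
      assert(addFeature("+altivec") == "+64bit,+altivec");
      return 0;
    }
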

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSubtarget.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSubtarget.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCSubtarget.h Tue Jan 15 11:16:16 2013
@@ -33,32 +33,34 @@
   enum {
     DIR_NONE,
     DIR_32,
-    DIR_440, 
-    DIR_601, 
-    DIR_602, 
-    DIR_603, 
+    DIR_440,
+    DIR_601,
+    DIR_602,
+    DIR_603,
     DIR_7400,
-    DIR_750, 
-    DIR_970, 
+    DIR_750,
+    DIR_970,
     DIR_A2,
+    DIR_E500mc,
+    DIR_E5500,
     DIR_PWR6,
     DIR_PWR7,
-    DIR_64  
+    DIR_64
   };
 }
 
 class GlobalValue;
 class TargetMachine;
-  
+
 class PPCSubtarget : public PPCGenSubtargetInfo {
 protected:
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned StackAlignment;
-  
+
   /// Selected instruction itineraries (one entry per itinerary class.)
   InstrItineraryData InstrItins;
-  
+
   /// Which cpu directive was used.
   unsigned DarwinDirective;
 
@@ -74,7 +76,7 @@
   bool IsBookE;
   bool HasLazyResolverStubs;
   bool IsJITCodeModel;
-  
+
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
 
@@ -84,11 +86,11 @@
   ///
   PPCSubtarget(const std::string &TT, const std::string &CPU,
                const std::string &FS, bool is64Bit);
-  
-  /// ParseSubtargetFeatures - Parses features string setting specified 
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-  
+
   /// SetJITMode - This is called to inform the subtarget info that we are
   /// producing code for the JIT.
   void SetJITMode();
@@ -97,20 +99,27 @@
   /// stack frame on entry to the function and which must be maintained by every
   /// function for this subtarget.
   unsigned getStackAlignment() const { return StackAlignment; }
-  
+
   /// getDarwinDirective - Returns the -m directive specified for the cpu.
   ///
   unsigned getDarwinDirective() const { return DarwinDirective; }
-  
-  /// getInstrItins - Return the instruction itineraies based on subtarget 
+
+  /// getInstrItins - Return the instruction itineraries based on subtarget
   /// selection.
   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
 
-  /// getTargetDataString - Return the pointer size and type alignment
+  /// getDataLayoutString - Return the pointer size and type alignment
   /// properties of this subtarget.
-  const char *getTargetDataString() const {
+  const char *getDataLayoutString() const {
     // Note, the alignment values for f64 and i64 on ppc64 in Darwin
     // documentation are wrong; these are correct (i.e. "what gcc does").
+    if (isPPC64() && isSVR4ABI()) {
+      if (TargetTriple.getOS() == llvm::Triple::FreeBSD)
+        return "E-p:64:64-f64:64:64-i64:64:64-f128:64:64-v128:128:128-n32:64";
+      else
+        return "E-p:64:64-f64:64:64-i64:64:64-f128:128:128-v128:128:128-n32:64";
+    }
+
     return isPPC64() ? "E-p:64:64-f64:64:64-i64:64:64-f128:64:128-n32:64"
                      : "E-p:32:32-f64:64:64-i64:64:64-f128:64:128-n32";
   }
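
Reading the new 64-bit SVR4 layout string piecewise (standard DataLayout
syntax):

    E             big-endian
    p:64:64       64-bit pointers, 64-bit aligned
    f64:64:64     double aligned to 64 bits
    i64:64:64     i64 aligned to 64 bits
    f128:128:128  128-bit long double aligned to 128 bits
                  (f128:64:64, i.e. 64-bit alignment, on FreeBSD)
    v128:128:128  128-bit vectors aligned to 128 bits
    n32:64        native integer widths are 32 and 64 bits
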
@@ -118,22 +127,22 @@
   /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
   ///
   bool isPPC64() const { return IsPPC64; }
-  
+
   /// has64BitSupport - Return true if the selected CPU supports 64-bit
   /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
   bool has64BitSupport() const { return Has64BitSupport; }
-  
+
   /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
   /// registers in 32-bit mode when possible.  This can only true if
   /// has64BitSupport() returns true.
   bool use64BitRegs() const { return Use64BitRegs; }
-  
+
   /// hasLazyResolverStub - Return true if accesses to the specified global have
   /// to go through a dyld lazy resolution stub.  This means that an extra load
   /// is required to get the address of the global.
-  bool hasLazyResolverStub(const GlobalValue *GV, 
+  bool hasLazyResolverStub(const GlobalValue *GV,
                            const TargetMachine &TM) const;
-  
+
   // isJITCodeModel - True if we're generating code for the JIT
   bool isJITCodeModel() const { return IsJITCodeModel; }
 

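For context, the string built here is consumed directly by the DataLayout
constructor in PPCTargetMachine.cpp below. A minimal sketch (not part of the
patch; 3.2-era headers assumed) of what a consumer of the 64-bit SVR4 string
observes:

    #include "llvm/DataLayout.h"
    using namespace llvm;

    void inspectPPC64Layout() {
      // Generic 64-bit SVR4 string from getDataLayoutString(); the FreeBSD
      // variant differs only in giving f128 a 64-bit ABI alignment.
      DataLayout DL("E-p:64:64-f64:64:64-i64:64:64-f128:128:128-"
                    "v128:128:128-n32:64");
      unsigned PtrBytes = DL.getPointerSize(0); // 8 bytes, from "p:64:64"
      bool BigEndian = !DL.isLittleEndian();    // true, from the leading "E"
      (void)PtrBytes; (void)BigEndian;
    }
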
Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCTargetMachine.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCTargetMachine.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCTargetMachine.cpp Tue Jan 15 11:16:16 2013
@@ -40,10 +40,11 @@
                                    bool is64Bit)
   : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
     Subtarget(TT, CPU, FS, is64Bit),
-    DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this),
+    DL(Subtarget.getDataLayoutString()), InstrInfo(*this),
     FrameLowering(Subtarget), JITInfo(*this, is64Bit),
     TLInfo(*this), TSInfo(*this),
-    InstrItins(Subtarget.getInstrItineraryData()) {
+    InstrItins(Subtarget.getInstrItineraryData()),
+    STTI(&TLInfo), VTTI(&TLInfo) {
 
   // The binutils for the BG/P are too old for CFI.
   if (Subtarget.isBGP())

Modified: llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCTargetMachine.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCTargetMachine.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCTargetMachine.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/PowerPC/PPCTargetMachine.h Tue Jan 15 11:16:16 2013
@@ -21,7 +21,8 @@
 #include "PPCISelLowering.h"
 #include "PPCSelectionDAGInfo.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetTransformImpl.h"
+#include "llvm/DataLayout.h"
 
 namespace llvm {
 
@@ -29,13 +30,15 @@
 ///
 class PPCTargetMachine : public LLVMTargetMachine {
   PPCSubtarget        Subtarget;
-  const TargetData    DataLayout;       // Calculates type size & alignment
+  const DataLayout    DL;       // Calculates type size & alignment
   PPCInstrInfo        InstrInfo;
   PPCFrameLowering    FrameLowering;
   PPCJITInfo          JITInfo;
   PPCTargetLowering   TLInfo;
   PPCSelectionDAGInfo TSInfo;
   InstrItineraryData  InstrItins;
+  ScalarTargetTransformImpl STTI;
+  VectorTargetTransformImpl VTTI;
 
 public:
   PPCTargetMachine(const Target &T, StringRef TT,
@@ -58,11 +61,17 @@
     return &InstrInfo.getRegisterInfo();
   }
 
-  virtual const TargetData    *getTargetData() const    { return &DataLayout; }
+  virtual const DataLayout    *getDataLayout() const    { return &DL; }
   virtual const PPCSubtarget  *getSubtargetImpl() const { return &Subtarget; }
   virtual const InstrItineraryData *getInstrItineraryData() const {
     return &InstrItins;
   }
+  virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+    return &STTI;
+  }
+  virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+    return &VTTI;
+  }
 
   // Pass Pipeline Configuration
   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);

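For context, a minimal sketch (interface and header names assumed from the
3.2-era llvm/TargetTransformInfo.h) of how a pass might consume the two hooks
wired up here; the Impl classes simply forward such queries to
PPCTargetLowering:

    #include "llvm/TargetTransformInfo.h"
    #include "llvm/Target/TargetMachine.h"

    // Hypothetical query site; TM is any LLVMTargetMachine.
    void queryTTI(const llvm::TargetMachine &TM) {
      const llvm::ScalarTargetTransformInfo *STTI =
          TM.getScalarTargetTransformInfo();
      if (STTI && STTI->isLegalICmpImmediate(42)) {
        // Safe to fold the immediate straight into a compare.
      }
    }
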
Modified: llvm/branches/AMDILBackend/lib/Target/README.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/README.txt?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/README.txt (original)
+++ llvm/branches/AMDILBackend/lib/Target/README.txt Tue Jan 15 11:16:16 2013
@@ -152,7 +152,7 @@
 
 //===---------------------------------------------------------------------===//
 
-For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal
 to the type size. It works but can be overly conservative as the alignment of
 specific vector types are target dependent.
 
@@ -2367,8 +2367,3 @@
 should fold to x > y.
 
 //===---------------------------------------------------------------------===//
-
-int f(double x) { return __builtin_fabs(x) < 0.0; }
-should fold to false.
-
-//===---------------------------------------------------------------------===//

Modified: llvm/branches/AMDILBackend/lib/Target/Sparc/SparcFrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/Sparc/SparcFrameLowering.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/Sparc/SparcFrameLowering.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/Sparc/SparcFrameLowering.cpp Tue Jan 15 11:16:16 2013
@@ -20,7 +20,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/CommandLine.h"
 

Modified: llvm/branches/AMDILBackend/lib/Target/Sparc/SparcISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/Sparc/SparcISelLowering.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/Sparc/SparcISelLowering.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/Sparc/SparcISelLowering.cpp Tue Jan 15 11:16:16 2013
@@ -637,7 +637,7 @@
 
   PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
   Type *ElementTy = Ty->getElementType();
-  return getTargetData()->getTypeAllocSize(ElementTy);
+  return getDataLayout()->getTypeAllocSize(ElementTy);
 }
 
 //===----------------------------------------------------------------------===//

Modified: llvm/branches/AMDILBackend/lib/Target/Sparc/SparcInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/Sparc/SparcInstrInfo.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/Sparc/SparcInstrInfo.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/Sparc/SparcInstrInfo.td Tue Jan 15 11:16:16 2013
@@ -129,7 +129,7 @@
                            [SDNPHasChain, SDNPOptInGlue]>;
 
 def flushw        : SDNode<"SPISD::FLUSHW", SDTNone,
-                           [SDNPHasChain]>;
+                           [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
 
 def getPCX        : Operand<i32> {
   let PrintMethod = "printGetPCX";

Modified: llvm/branches/AMDILBackend/lib/Target/Sparc/SparcTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/Sparc/SparcTargetMachine.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/Sparc/SparcTargetMachine.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/Sparc/SparcTargetMachine.cpp Tue Jan 15 11:16:16 2013
@@ -33,10 +33,10 @@
                                        bool is64bit)
   : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
     Subtarget(TT, CPU, FS, is64bit),
-    DataLayout(Subtarget.getDataLayout()),
+    DL(Subtarget.getDataLayout()),
     InstrInfo(Subtarget),
     TLInfo(*this), TSInfo(*this),
-    FrameLowering(Subtarget) {
+    FrameLowering(Subtarget), STTI(&TLInfo), VTTI(&TLInfo) {
 }
 
 namespace {

Modified: llvm/branches/AMDILBackend/lib/Target/Sparc/SparcTargetMachine.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/Sparc/SparcTargetMachine.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/Sparc/SparcTargetMachine.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/Sparc/SparcTargetMachine.h Tue Jan 15 11:16:16 2013
@@ -20,18 +20,21 @@
 #include "SparcSelectionDAGInfo.h"
 #include "SparcSubtarget.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetTransformImpl.h"
 
 namespace llvm {
 
 class SparcTargetMachine : public LLVMTargetMachine {
   SparcSubtarget Subtarget;
-  const TargetData DataLayout;       // Calculates type size & alignment
+  const DataLayout DL;       // Calculates type size & alignment
   SparcInstrInfo InstrInfo;
   SparcTargetLowering TLInfo;
   SparcSelectionDAGInfo TSInfo;
   SparcFrameLowering FrameLowering;
+  ScalarTargetTransformImpl STTI;
+  VectorTargetTransformImpl VTTI;
 public:
   SparcTargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -52,7 +55,13 @@
   virtual const SparcSelectionDAGInfo* getSelectionDAGInfo() const {
     return &TSInfo;
   }
-  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+    return &STTI;
+  }
+  virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+    return &VTTI;
+  }
+  virtual const DataLayout       *getDataLayout() const { return &DL; }
 
   // Pass Pipeline Configuration
   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);

Modified: llvm/branches/AMDILBackend/lib/Target/Target.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/Target.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/Target.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/Target.cpp Tue Jan 15 11:16:16 2013
@@ -16,7 +16,7 @@
 #include "llvm-c/Initialization.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/PassManager.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/LLVMContext.h"
 #include <cstring>
@@ -24,8 +24,9 @@
 using namespace llvm;
 
 void llvm::initializeTarget(PassRegistry &Registry) {
-  initializeTargetDataPass(Registry);
+  initializeDataLayoutPass(Registry);
   initializeTargetLibraryInfoPass(Registry);
+  initializeTargetTransformInfoPass(Registry);
 }
 
 void LLVMInitializeTarget(LLVMPassRegistryRef R) {
@@ -33,11 +34,11 @@
 }
 
 LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
-  return wrap(new TargetData(StringRep));
+  return wrap(new DataLayout(StringRep));
 }
 
 void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
-  unwrap(PM)->add(new TargetData(*unwrap(TD)));
+  unwrap(PM)->add(new DataLayout(*unwrap(TD)));
 }
 
 void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
@@ -55,13 +56,21 @@
 }
 
 unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
-  return unwrap(TD)->getPointerSize();
+  return unwrap(TD)->getPointerSize(0);
+}
+
+unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  return unwrap(TD)->getPointerSize(AS);
 }
 
 LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
   return wrap(unwrap(TD)->getIntPtrType(getGlobalContext()));
 }
 
+LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), AS));
+}
+
 unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
   return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
 }

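For context, a minimal sketch of the new address-space-aware C API entry
points added above (their declarations land in llvm-c/Target.h elsewhere in
this commit; address space 1 is just an illustrative value):

    #include "llvm-c/Target.h"

    static void demo(LLVMTargetDataRef TD) {
      unsigned Flat = LLVMPointerSize(TD);         /* now forwards to AS 0 */
      unsigned AS1  = LLVMPointerSizeForAS(TD, 1); /* per-address-space size */
      LLVMTypeRef IntPtr = LLVMIntPtrTypeForAS(TD, 1);
      (void)Flat; (void)AS1; (void)IntPtr;
    }
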
Removed: llvm/branches/AMDILBackend/lib/Target/TargetData.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/TargetData.cpp?rev=172540&view=auto
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/TargetData.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/TargetData.cpp (removed)
@@ -1,663 +0,0 @@
-//===-- TargetData.cpp - Data size & alignment routines --------------------==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines target properties related to datatype size/offset/alignment
-// information.
-//
-// This structure should be created once, filled in if the defaults are not
-// correct and then passed around by const&.  None of the members functions
-// require modification to the object.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetData.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/ADT/DenseMap.h"
-#include <algorithm>
-#include <cstdlib>
-using namespace llvm;
-
-// Handle the Pass registration stuff necessary to use TargetData's.
-
-// Register the default SparcV9 implementation...
-INITIALIZE_PASS(TargetData, "targetdata", "Target Data Layout", false, true)
-char TargetData::ID = 0;
-
-//===----------------------------------------------------------------------===//
-// Support for StructLayout
-//===----------------------------------------------------------------------===//
-
-StructLayout::StructLayout(StructType *ST, const TargetData &TD) {
-  assert(!ST->isOpaque() && "Cannot get layout of opaque structs");
-  StructAlignment = 0;
-  StructSize = 0;
-  NumElements = ST->getNumElements();
-
-  // Loop over each of the elements, placing them in memory.
-  for (unsigned i = 0, e = NumElements; i != e; ++i) {
-    Type *Ty = ST->getElementType(i);
-    unsigned TyAlign = ST->isPacked() ? 1 : TD.getABITypeAlignment(Ty);
-
-    // Add padding if necessary to align the data element properly.
-    if ((StructSize & (TyAlign-1)) != 0)
-      StructSize = TargetData::RoundUpAlignment(StructSize, TyAlign);
-
-    // Keep track of maximum alignment constraint.
-    StructAlignment = std::max(TyAlign, StructAlignment);
-
-    MemberOffsets[i] = StructSize;
-    StructSize += TD.getTypeAllocSize(Ty); // Consume space for this data item
-  }
-
-  // Empty structures have alignment of 1 byte.
-  if (StructAlignment == 0) StructAlignment = 1;
-
-  // Add padding to the end of the struct so that it could be put in an array
-  // and all array elements would be aligned correctly.
-  if ((StructSize & (StructAlignment-1)) != 0)
-    StructSize = TargetData::RoundUpAlignment(StructSize, StructAlignment);
-}
-
-
-/// getElementContainingOffset - Given a valid offset into the structure,
-/// return the structure index that contains it.
-unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const {
-  const uint64_t *SI =
-    std::upper_bound(&MemberOffsets[0], &MemberOffsets[NumElements], Offset);
-  assert(SI != &MemberOffsets[0] && "Offset not in structure type!");
-  --SI;
-  assert(*SI <= Offset && "upper_bound didn't work");
-  assert((SI == &MemberOffsets[0] || *(SI-1) <= Offset) &&
-         (SI+1 == &MemberOffsets[NumElements] || *(SI+1) > Offset) &&
-         "Upper bound didn't work!");
-
-  // Multiple fields can have the same offset if any of them are zero sized.
-  // For example, in { i32, [0 x i32], i32 }, searching for offset 4 will stop
-  // at the i32 element, because it is the last element at that offset.  This is
-  // the right one to return, because anything after it will have a higher
-  // offset, implying that this element is non-empty.
-  return SI-&MemberOffsets[0];
-}
-
-//===----------------------------------------------------------------------===//
-// TargetAlignElem, TargetAlign support
-//===----------------------------------------------------------------------===//
-
-TargetAlignElem
-TargetAlignElem::get(AlignTypeEnum align_type, unsigned abi_align,
-                     unsigned pref_align, uint32_t bit_width) {
-  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
-  TargetAlignElem retval;
-  retval.AlignType = align_type;
-  retval.ABIAlign = abi_align;
-  retval.PrefAlign = pref_align;
-  retval.TypeBitWidth = bit_width;
-  return retval;
-}
-
-bool
-TargetAlignElem::operator==(const TargetAlignElem &rhs) const {
-  return (AlignType == rhs.AlignType
-          && ABIAlign == rhs.ABIAlign
-          && PrefAlign == rhs.PrefAlign
-          && TypeBitWidth == rhs.TypeBitWidth);
-}
-
-const TargetAlignElem
-TargetData::InvalidAlignmentElem = { (AlignTypeEnum)0xFF, 0, 0, 0 };
-
-//===----------------------------------------------------------------------===//
-//                       TargetData Class Implementation
-//===----------------------------------------------------------------------===//
-
-/// getInt - Get an integer ignoring errors.
-static int getInt(StringRef R) {
-  int Result = 0;
-  R.getAsInteger(10, Result);
-  return Result;
-}
-
-void TargetData::init() {
-  initializeTargetDataPass(*PassRegistry::getPassRegistry());
-
-  LayoutMap = 0;
-  LittleEndian = false;
-  PointerMemSize = 8;
-  PointerABIAlign = 8;
-  PointerPrefAlign = PointerABIAlign;
-  StackNaturalAlign = 0;
-
-  // Default alignments
-  setAlignment(INTEGER_ALIGN,   1,  1, 1);   // i1
-  setAlignment(INTEGER_ALIGN,   1,  1, 8);   // i8
-  setAlignment(INTEGER_ALIGN,   2,  2, 16);  // i16
-  setAlignment(INTEGER_ALIGN,   4,  4, 32);  // i32
-  setAlignment(INTEGER_ALIGN,   4,  8, 64);  // i64
-  setAlignment(FLOAT_ALIGN,     2,  2, 16);  // half
-  setAlignment(FLOAT_ALIGN,     4,  4, 32);  // float
-  setAlignment(FLOAT_ALIGN,     8,  8, 64);  // double
-  setAlignment(FLOAT_ALIGN,    16, 16, 128); // ppcf128, quad, ...
-  setAlignment(VECTOR_ALIGN,    8,  8, 64);  // v2i32, v1i64, ...
-  setAlignment(VECTOR_ALIGN,   16, 16, 128); // v16i8, v8i16, v4i32, ...
-  setAlignment(AGGREGATE_ALIGN, 0,  8,  0);  // struct
-}
-
-std::string TargetData::parseSpecifier(StringRef Desc, TargetData *td) {
-
-  if (td)
-    td->init();
-
-  while (!Desc.empty()) {
-    std::pair<StringRef, StringRef> Split = Desc.split('-');
-    StringRef Token = Split.first;
-    Desc = Split.second;
-
-    if (Token.empty())
-      continue;
-
-    Split = Token.split(':');
-    StringRef Specifier = Split.first;
-    Token = Split.second;
-
-    assert(!Specifier.empty() && "Can't be empty here");
-
-    switch (Specifier[0]) {
-    case 'E':
-      if (td)
-        td->LittleEndian = false;
-      break;
-    case 'e':
-      if (td)
-        td->LittleEndian = true;
-      break;
-    case 'p': {
-      // Pointer size.
-      Split = Token.split(':');
-      int PointerMemSizeBits = getInt(Split.first);
-      if (PointerMemSizeBits < 0 || PointerMemSizeBits % 8 != 0)
-        return "invalid pointer size, must be a positive 8-bit multiple";
-      if (td)
-        td->PointerMemSize = PointerMemSizeBits / 8;
-
-      // Pointer ABI alignment.
-      Split = Split.second.split(':');
-      int PointerABIAlignBits = getInt(Split.first);
-      if (PointerABIAlignBits < 0 || PointerABIAlignBits % 8 != 0) {
-        return "invalid pointer ABI alignment, "
-               "must be a positive 8-bit multiple";
-      }
-      if (td)
-        td->PointerABIAlign = PointerABIAlignBits / 8;
-
-      // Pointer preferred alignment.
-      Split = Split.second.split(':');
-      int PointerPrefAlignBits = getInt(Split.first);
-      if (PointerPrefAlignBits < 0 || PointerPrefAlignBits % 8 != 0) {
-        return "invalid pointer preferred alignment, "
-               "must be a positive 8-bit multiple";
-      }
-      if (td) {
-        td->PointerPrefAlign = PointerPrefAlignBits / 8;
-        if (td->PointerPrefAlign == 0)
-          td->PointerPrefAlign = td->PointerABIAlign;
-      }
-      break;
-    }
-    case 'i':
-    case 'v':
-    case 'f':
-    case 'a':
-    case 's': {
-      AlignTypeEnum AlignType;
-      char field = Specifier[0];
-      switch (field) {
-      default:
-      case 'i': AlignType = INTEGER_ALIGN; break;
-      case 'v': AlignType = VECTOR_ALIGN; break;
-      case 'f': AlignType = FLOAT_ALIGN; break;
-      case 'a': AlignType = AGGREGATE_ALIGN; break;
-      case 's': AlignType = STACK_ALIGN; break;
-      }
-      int Size = getInt(Specifier.substr(1));
-      if (Size < 0) {
-        return std::string("invalid ") + field + "-size field, "
-               "must be positive";
-      }
-
-      Split = Token.split(':');
-      int ABIAlignBits = getInt(Split.first);
-      if (ABIAlignBits < 0 || ABIAlignBits % 8 != 0) {
-        return std::string("invalid ") + field +"-abi-alignment field, "
-               "must be a positive 8-bit multiple";
-      }
-      unsigned ABIAlign = ABIAlignBits / 8;
-
-      Split = Split.second.split(':');
-
-      int PrefAlignBits = getInt(Split.first);
-      if (PrefAlignBits < 0 || PrefAlignBits % 8 != 0) {
-        return std::string("invalid ") + field +"-preferred-alignment field, "
-               "must be a positive 8-bit multiple";
-      }
-      unsigned PrefAlign = PrefAlignBits / 8;
-      if (PrefAlign == 0)
-        PrefAlign = ABIAlign;
-      
-      if (td)
-        td->setAlignment(AlignType, ABIAlign, PrefAlign, Size);
-      break;
-    }
-    case 'n':  // Native integer types.
-      Specifier = Specifier.substr(1);
-      do {
-        int Width = getInt(Specifier);
-        if (Width <= 0) {
-          return std::string("invalid native integer size \'") + Specifier.str() +
-                 "\', must be a positive integer.";
-        }
-        if (td && Width != 0)
-          td->LegalIntWidths.push_back(Width);
-        Split = Token.split(':');
-        Specifier = Split.first;
-        Token = Split.second;
-      } while (!Specifier.empty() || !Token.empty());
-      break;
-    case 'S': { // Stack natural alignment.
-      int StackNaturalAlignBits = getInt(Specifier.substr(1));
-      if (StackNaturalAlignBits < 0 || StackNaturalAlignBits % 8 != 0) {
-        return "invalid natural stack alignment (S-field), "
-               "must be a positive 8-bit multiple";
-      }
-      if (td)
-        td->StackNaturalAlign = StackNaturalAlignBits / 8;
-      break;
-    }
-    default:
-      break;
-    }
-  }
-
-  return "";
-}
-
-/// Default ctor.
-///
-/// @note This has to exist, because this is a pass, but it should never be
-/// used.
-TargetData::TargetData() : ImmutablePass(ID) {
-  report_fatal_error("Bad TargetData ctor used.  "
-                    "Tool did not specify a TargetData to use?");
-}
-
-TargetData::TargetData(const Module *M)
-  : ImmutablePass(ID) {
-  std::string errMsg = parseSpecifier(M->getDataLayout(), this);
-  assert(errMsg == "" && "Module M has malformed target data layout string.");
-  (void)errMsg;
-}
-
-void
-TargetData::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
-                         unsigned pref_align, uint32_t bit_width) {
-  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
-  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
-    if (Alignments[i].AlignType == align_type &&
-        Alignments[i].TypeBitWidth == bit_width) {
-      // Update the abi, preferred alignments.
-      Alignments[i].ABIAlign = abi_align;
-      Alignments[i].PrefAlign = pref_align;
-      return;
-    }
-  }
-
-  Alignments.push_back(TargetAlignElem::get(align_type, abi_align,
-                                            pref_align, bit_width));
-}
-
-/// getAlignmentInfo - Return the alignment (either ABI if ABIInfo = true or
-/// preferred if ABIInfo = false) the target wants for the specified datatype.
-unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
-                                      uint32_t BitWidth, bool ABIInfo,
-                                      Type *Ty) const {
-  // Check to see if we have an exact match and remember the best match we see.
-  int BestMatchIdx = -1;
-  int LargestInt = -1;
-  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
-    if (Alignments[i].AlignType == AlignType &&
-        Alignments[i].TypeBitWidth == BitWidth)
-      return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
-
-    // The best match so far depends on what we're looking for.
-     if (AlignType == INTEGER_ALIGN &&
-         Alignments[i].AlignType == INTEGER_ALIGN) {
-      // The "best match" for integers is the smallest size that is larger than
-      // the BitWidth requested.
-      if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 ||
-           Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth))
-        BestMatchIdx = i;
-      // However, if there isn't one that's larger, then we must use the
-      // largest one we have (see below)
-      if (LargestInt == -1 ||
-          Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth)
-        LargestInt = i;
-    }
-  }
-
-  // Okay, we didn't find an exact solution.  Fall back here depending on what
-  // is being looked for.
-  if (BestMatchIdx == -1) {
-    // If we didn't find an integer alignment, fall back on most conservative.
-    if (AlignType == INTEGER_ALIGN) {
-      BestMatchIdx = LargestInt;
-    } else {
-      assert(AlignType == VECTOR_ALIGN && "Unknown alignment type!");
-
-      // By default, use natural alignment for vector types. This is consistent
-      // with what clang and llvm-gcc do.
-      unsigned Align = getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
-      Align *= cast<VectorType>(Ty)->getNumElements();
-      // If the alignment is not a power of 2, round up to the next power of 2.
-      // This happens for non-power-of-2 length vectors.
-      if (Align & (Align-1))
-        Align = NextPowerOf2(Align);
-      return Align;
-    }
-  }
-
-  // Since we got a "best match" index, just return it.
-  return ABIInfo ? Alignments[BestMatchIdx].ABIAlign
-                 : Alignments[BestMatchIdx].PrefAlign;
-}
-
-namespace {
-
-class StructLayoutMap {
-  typedef DenseMap<StructType*, StructLayout*> LayoutInfoTy;
-  LayoutInfoTy LayoutInfo;
-
-public:
-  virtual ~StructLayoutMap() {
-    // Remove any layouts.
-    for (LayoutInfoTy::iterator I = LayoutInfo.begin(), E = LayoutInfo.end();
-         I != E; ++I) {
-      StructLayout *Value = I->second;
-      Value->~StructLayout();
-      free(Value);
-    }
-  }
-
-  StructLayout *&operator[](StructType *STy) {
-    return LayoutInfo[STy];
-  }
-
-  // for debugging...
-  virtual void dump() const {}
-};
-
-} // end anonymous namespace
-
-TargetData::~TargetData() {
-  delete static_cast<StructLayoutMap*>(LayoutMap);
-}
-
-const StructLayout *TargetData::getStructLayout(StructType *Ty) const {
-  if (!LayoutMap)
-    LayoutMap = new StructLayoutMap();
-
-  StructLayoutMap *STM = static_cast<StructLayoutMap*>(LayoutMap);
-  StructLayout *&SL = (*STM)[Ty];
-  if (SL) return SL;
-
-  // Otherwise, create the struct layout.  Because it is variable length, we
-  // malloc it, then use placement new.
-  int NumElts = Ty->getNumElements();
-  StructLayout *L =
-    (StructLayout *)malloc(sizeof(StructLayout)+(NumElts-1) * sizeof(uint64_t));
-
-  // Set SL before calling StructLayout's ctor.  The ctor could cause other
-  // entries to be added to TheMap, invalidating our reference.
-  SL = L;
-
-  new (L) StructLayout(Ty, *this);
-
-  return L;
-}
-
-std::string TargetData::getStringRepresentation() const {
-  std::string Result;
-  raw_string_ostream OS(Result);
-
-  OS << (LittleEndian ? "e" : "E")
-     << "-p:" << PointerMemSize*8 << ':' << PointerABIAlign*8
-     << ':' << PointerPrefAlign*8
-     << "-S" << StackNaturalAlign*8;
-
-  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
-    const TargetAlignElem &AI = Alignments[i];
-    OS << '-' << (char)AI.AlignType << AI.TypeBitWidth << ':'
-       << AI.ABIAlign*8 << ':' << AI.PrefAlign*8;
-  }
-
-  if (!LegalIntWidths.empty()) {
-    OS << "-n" << (unsigned)LegalIntWidths[0];
-
-    for (unsigned i = 1, e = LegalIntWidths.size(); i != e; ++i)
-      OS << ':' << (unsigned)LegalIntWidths[i];
-  }
-  return OS.str();
-}
-
-
-uint64_t TargetData::getTypeSizeInBits(Type *Ty) const {
-  assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
-  switch (Ty->getTypeID()) {
-  case Type::LabelTyID:
-  case Type::PointerTyID:
-    return getPointerSizeInBits();
-  case Type::ArrayTyID: {
-    ArrayType *ATy = cast<ArrayType>(Ty);
-    return getTypeAllocSizeInBits(ATy->getElementType())*ATy->getNumElements();
-  }
-  case Type::StructTyID:
-    // Get the layout annotation... which is lazily created on demand.
-    return getStructLayout(cast<StructType>(Ty))->getSizeInBits();
-  case Type::IntegerTyID:
-    return cast<IntegerType>(Ty)->getBitWidth();
-  case Type::VoidTyID:
-    return 8;
-  case Type::HalfTyID:
-    return 16;
-  case Type::FloatTyID:
-    return 32;
-  case Type::DoubleTyID:
-  case Type::X86_MMXTyID:
-    return 64;
-  case Type::PPC_FP128TyID:
-  case Type::FP128TyID:
-    return 128;
-  // In memory objects this is always aligned to a higher boundary, but
-  // only 80 bits contain information.
-  case Type::X86_FP80TyID:
-    return 80;
-  case Type::VectorTyID:
-    return cast<VectorType>(Ty)->getBitWidth();
-  default:
-    llvm_unreachable("TargetData::getTypeSizeInBits(): Unsupported type");
-  }
-}
-
-/*!
-  \param abi_or_pref Flag that determines which alignment is returned. true
-  returns the ABI alignment, false returns the preferred alignment.
-  \param Ty The underlying type for which alignment is determined.
-
-  Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref
-  == false) for the requested type \a Ty.
- */
-unsigned TargetData::getAlignment(Type *Ty, bool abi_or_pref) const {
-  int AlignType = -1;
-
-  assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
-  switch (Ty->getTypeID()) {
-  // Early escape for the non-numeric types.
-  case Type::LabelTyID:
-  case Type::PointerTyID:
-    return (abi_or_pref
-            ? getPointerABIAlignment()
-            : getPointerPrefAlignment());
-  case Type::ArrayTyID:
-    return getAlignment(cast<ArrayType>(Ty)->getElementType(), abi_or_pref);
-
-  case Type::StructTyID: {
-    // Packed structure types always have an ABI alignment of one.
-    if (cast<StructType>(Ty)->isPacked() && abi_or_pref)
-      return 1;
-
-    // Get the layout annotation... which is lazily created on demand.
-    const StructLayout *Layout = getStructLayout(cast<StructType>(Ty));
-    unsigned Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref, Ty);
-    return std::max(Align, Layout->getAlignment());
-  }
-  case Type::IntegerTyID:
-  case Type::VoidTyID:
-    AlignType = INTEGER_ALIGN;
-    break;
-  case Type::HalfTyID:
-  case Type::FloatTyID:
-  case Type::DoubleTyID:
-  // PPC_FP128TyID and FP128TyID have different data contents, but the
-  // same size and alignment, so they look the same here.
-  case Type::PPC_FP128TyID:
-  case Type::FP128TyID:
-  case Type::X86_FP80TyID:
-    AlignType = FLOAT_ALIGN;
-    break;
-  case Type::X86_MMXTyID:
-  case Type::VectorTyID:
-    AlignType = VECTOR_ALIGN;
-    break;
-  default:
-    llvm_unreachable("Bad type for getAlignment!!!");
-  }
-
-  return getAlignmentInfo((AlignTypeEnum)AlignType, getTypeSizeInBits(Ty),
-                          abi_or_pref, Ty);
-}
-
-unsigned TargetData::getABITypeAlignment(Type *Ty) const {
-  return getAlignment(Ty, true);
-}
-
-/// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for
-/// an integer type of the specified bitwidth.
-unsigned TargetData::getABIIntegerTypeAlignment(unsigned BitWidth) const {
-  return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, 0);
-}
-
-
-unsigned TargetData::getCallFrameTypeAlignment(Type *Ty) const {
-  for (unsigned i = 0, e = Alignments.size(); i != e; ++i)
-    if (Alignments[i].AlignType == STACK_ALIGN)
-      return Alignments[i].ABIAlign;
-
-  return getABITypeAlignment(Ty);
-}
-
-unsigned TargetData::getPrefTypeAlignment(Type *Ty) const {
-  return getAlignment(Ty, false);
-}
-
-unsigned TargetData::getPreferredTypeAlignmentShift(Type *Ty) const {
-  unsigned Align = getPrefTypeAlignment(Ty);
-  assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
-  return Log2_32(Align);
-}
-
-/// getIntPtrType - Return an unsigned integer type that is the same size or
-/// greater to the host pointer size.
-IntegerType *TargetData::getIntPtrType(LLVMContext &C) const {
-  return IntegerType::get(C, getPointerSizeInBits());
-}
-
-
-uint64_t TargetData::getIndexedOffset(Type *ptrTy,
-                                      ArrayRef<Value *> Indices) const {
-  Type *Ty = ptrTy;
-  assert(Ty->isPointerTy() && "Illegal argument for getIndexedOffset()");
-  uint64_t Result = 0;
-
-  generic_gep_type_iterator<Value* const*>
-    TI = gep_type_begin(ptrTy, Indices);
-  for (unsigned CurIDX = 0, EndIDX = Indices.size(); CurIDX != EndIDX;
-       ++CurIDX, ++TI) {
-    if (StructType *STy = dyn_cast<StructType>(*TI)) {
-      assert(Indices[CurIDX]->getType() ==
-             Type::getInt32Ty(ptrTy->getContext()) &&
-             "Illegal struct idx");
-      unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue();
-
-      // Get structure layout information...
-      const StructLayout *Layout = getStructLayout(STy);
-
-      // Add in the offset, as calculated by the structure layout info...
-      Result += Layout->getElementOffset(FieldNo);
-
-      // Update Ty to refer to current element
-      Ty = STy->getElementType(FieldNo);
-    } else {
-      // Update Ty to refer to current element
-      Ty = cast<SequentialType>(Ty)->getElementType();
-
-      // Get the array index and the size of each array element.
-      if (int64_t arrayIdx = cast<ConstantInt>(Indices[CurIDX])->getSExtValue())
-        Result += (uint64_t)arrayIdx * getTypeAllocSize(Ty);
-    }
-  }
-
-  return Result;
-}
-
-/// getPreferredAlignment - Return the preferred alignment of the specified
-/// global.  This includes an explicitly requested alignment (if the global
-/// has one).
-unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
-  Type *ElemType = GV->getType()->getElementType();
-  unsigned Alignment = getPrefTypeAlignment(ElemType);
-  unsigned GVAlignment = GV->getAlignment();
-  if (GVAlignment >= Alignment) {
-    Alignment = GVAlignment;
-  } else if (GVAlignment != 0) {
-    Alignment = std::max(GVAlignment, getABITypeAlignment(ElemType));
-  }
-
-  if (GV->hasInitializer() && GVAlignment == 0) {
-    if (Alignment < 16) {
-      // If the global is not external, see if it is large.  If so, give it a
-      // larger alignment.
-      if (getTypeSizeInBits(ElemType) > 128)
-        Alignment = 16;    // 16-byte alignment.
-    }
-  }
-  return Alignment;
-}
-
-/// getPreferredAlignmentLog - Return the preferred alignment of the
-/// specified global, returned in log form.  This includes an explicitly
-/// requested alignment (if the global has one).
-unsigned TargetData::getPreferredAlignmentLog(const GlobalVariable *GV) const {
-  return Log2_32(getPreferredAlignment(GV));
-}

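The logic deleted here moves alongside the renamed class (lib/VMCore in the
3.2 tree) rather than disappearing. For reference, a minimal sketch (3.2-era
headers assumed) of the padding rules the removed StructLayout ctor
implemented:

    #include "llvm/DataLayout.h"
    #include "llvm/DerivedTypes.h"
    #include "llvm/LLVMContext.h"
    using namespace llvm;

    void structLayoutDemo() {
      LLVMContext C;
      Type *Elts[] = { Type::getInt8Ty(C), Type::getInt32Ty(C),
                       Type::getInt16Ty(C) };
      StructType *ST = StructType::get(C, Elts);
      DataLayout DL("e"); // little-endian plus the default alignments above
      const StructLayout *SL = DL.getStructLayout(ST);
      // i8 at offset 0; i32 padded up to offset 4; i16 at offset 8; total
      // size rounded up to 12 so the struct tiles in an array (align 4).
      uint64_t Off = SL->getElementOffset(1); // == 4
      (void)Off;
    }
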
Removed: llvm/branches/AMDILBackend/lib/Target/TargetELFWriterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/TargetELFWriterInfo.cpp?rev=172540&view=auto
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/TargetELFWriterInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/TargetELFWriterInfo.cpp (removed)
@@ -1,25 +0,0 @@
-//===-- lib/Target/TargetELFWriterInfo.cpp - ELF Writer Info --0-*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the TargetELFWriterInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Function.h"
-#include "llvm/Target/TargetELFWriterInfo.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-TargetELFWriterInfo::TargetELFWriterInfo(bool is64Bit_, bool isLittleEndian_) :
-  is64Bit(is64Bit_), isLittleEndian(isLittleEndian_) {
-}
-
-TargetELFWriterInfo::~TargetELFWriterInfo() {}
-

Modified: llvm/branches/AMDILBackend/lib/Target/TargetLibraryInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/TargetLibraryInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/TargetLibraryInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/TargetLibraryInfo.cpp Tue Jan 15 11:16:16 2013
@@ -24,6 +24,16 @@
 
 const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
   {
+    "_ZdaPv",
+    "_ZdlPv",
+    "_Znaj",
+    "_ZnajRKSt9nothrow_t",
+    "_Znam",
+    "_ZnamRKSt9nothrow_t",
+    "_Znwj",
+    "_ZnwjRKSt9nothrow_t",
+    "_Znwm",
+    "_ZnwmRKSt9nothrow_t",
     "__cxa_atexit",
     "__cxa_guard_abort",
     "__cxa_guard_acquire",
@@ -31,16 +41,29 @@
     "__memcpy_chk",
     "acos",
     "acosf",
+    "acosh",
+    "acoshf",
+    "acoshl",
     "acosl",
     "asin",
     "asinf",
+    "asinh",
+    "asinhf",
+    "asinhl",
     "asinl",
     "atan",
     "atan2",
     "atan2f",
     "atan2l",
     "atanf",
+    "atanh",
+    "atanhf",
+    "atanhl",
     "atanl",
+    "calloc",
+    "cbrt",
+    "cbrtf",
+    "cbrtl",
     "ceil",
     "ceilf",
     "ceill",
@@ -54,6 +77,9 @@
     "coshl",
     "cosl",
     "exp",
+    "exp10",
+    "exp10f",
+    "exp10l",
     "exp2",
     "exp2f",
     "exp2l",
@@ -74,6 +100,7 @@
     "fmodl",
     "fputc",
     "fputs",
+    "free",
     "fwrite",
     "iprintf",
     "log",
@@ -86,8 +113,12 @@
     "log2",
     "log2f",
     "log2l",
+    "logb",
+    "logbf",
+    "logbl",
     "logf",
     "logl",
+    "malloc",
     "memchr",
     "memcmp",
     "memcpy",
@@ -97,11 +128,14 @@
     "nearbyint",
     "nearbyintf",
     "nearbyintl",
+    "posix_memalign",
     "pow",
     "powf",
     "powl",
     "putchar",
     "puts",
+    "realloc",
+    "reallocf",
     "rint",
     "rintf",
     "rintl",
@@ -118,14 +152,30 @@
     "sqrt",
     "sqrtf",
     "sqrtl",
+    "stpcpy",
     "strcat",
     "strchr",
+    "strcmp",
     "strcpy",
+    "strcspn",
+    "strdup",
     "strlen",
     "strncat",
     "strncmp",
     "strncpy",
+    "strndup",
     "strnlen",
+    "strpbrk",
+    "strrchr",
+    "strspn",
+    "strstr",
+    "strtod",
+    "strtof",
+    "strtol",
+    "strtold",
+    "strtoll",
+    "strtoul",
+    "strtoull",
     "tan",
     "tanf",
     "tanh",
@@ -134,7 +184,8 @@
     "tanl",
     "trunc",
     "truncf",
-    "truncl"
+    "truncl",
+    "valloc"
   };
 
 /// initialize - Initialize the set of available library functions based on the
@@ -205,6 +256,21 @@
     TLI.setUnavailable(LibFunc::tanhl);
 
     // Win32 only has C89 math
+    TLI.setUnavailable(LibFunc::acosh);
+    TLI.setUnavailable(LibFunc::acoshf);
+    TLI.setUnavailable(LibFunc::acoshl);
+    TLI.setUnavailable(LibFunc::asinh);
+    TLI.setUnavailable(LibFunc::asinhf);
+    TLI.setUnavailable(LibFunc::asinhl);
+    TLI.setUnavailable(LibFunc::atanh);
+    TLI.setUnavailable(LibFunc::atanhf);
+    TLI.setUnavailable(LibFunc::atanhl);
+    TLI.setUnavailable(LibFunc::cbrt);
+    TLI.setUnavailable(LibFunc::cbrtf);
+    TLI.setUnavailable(LibFunc::cbrtl);
+    TLI.setUnavailable(LibFunc::exp10);
+    TLI.setUnavailable(LibFunc::exp10f);
+    TLI.setUnavailable(LibFunc::exp10l);
     TLI.setUnavailable(LibFunc::exp2);
     TLI.setUnavailable(LibFunc::exp2f);
     TLI.setUnavailable(LibFunc::exp2l);
@@ -217,6 +283,9 @@
     TLI.setUnavailable(LibFunc::log1p);
     TLI.setUnavailable(LibFunc::log1pf);
     TLI.setUnavailable(LibFunc::log1pl);
+    TLI.setUnavailable(LibFunc::logb);
+    TLI.setUnavailable(LibFunc::logbf);
+    TLI.setUnavailable(LibFunc::logbl);
     TLI.setUnavailable(LibFunc::nearbyint);
     TLI.setUnavailable(LibFunc::nearbyintf);
     TLI.setUnavailable(LibFunc::nearbyintl);
@@ -254,6 +323,10 @@
       TLI.setUnavailable(LibFunc::tanf);
       TLI.setUnavailable(LibFunc::tanhf);
     }
+
+    // Win32 does *not* provide stpcpy.  It is provided on POSIX systems:
+    // http://pubs.opengroup.org/onlinepubs/9699919799/functions/stpcpy.html
+    TLI.setUnavailable(LibFunc::stpcpy);
   }
 }
 

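Note that the new entries keep StandardNames in ASCII order (the mangled
operator new/delete names "_Z..." sort before "__cxa_..."), which the
name-based lookup presumably relies on. A minimal sketch (3.2-era API
assumed) of how availability is queried:

    #include "llvm/Target/TargetLibraryInfo.h"
    #include "llvm/ADT/Triple.h"
    using namespace llvm;

    bool haveStpcpy(StringRef TripleStr) {
      TargetLibraryInfo TLI((Triple(TripleStr)));
      // false for "i686-pc-win32" after this change; true on POSIX triples.
      return TLI.has(LibFunc::stpcpy);
    }
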
Modified: llvm/branches/AMDILBackend/lib/Target/TargetLoweringObjectFile.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/TargetLoweringObjectFile.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/TargetLoweringObjectFile.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/TargetLoweringObjectFile.cpp Tue Jan 15 11:16:16 2013
@@ -22,7 +22,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/Dwarf.h"
@@ -184,7 +184,7 @@
       // Otherwise, just drop it into a mergable constant section.  If we have
       // a section for this size, use it, otherwise use the arbitrary sized
       // mergable section.
-      switch (TM.getTargetData()->getTypeAllocSize(C->getType())) {
+      switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) {
       case 4:  return SectionKind::getMergeableConst4();
       case 8:  return SectionKind::getMergeableConst8();
       case 16: return SectionKind::getMergeableConst16();

Modified: llvm/branches/AMDILBackend/lib/Target/TargetMachineC.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/TargetMachineC.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/TargetMachineC.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/TargetMachineC.cpp Tue Jan 15 11:16:16 2013
@@ -14,7 +14,7 @@
 #include "llvm-c/Core.h"
 #include "llvm-c/Target.h"
 #include "llvm-c/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
@@ -146,7 +146,7 @@
 }
 
 LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
-  return wrap(unwrap(T)->getTargetData());
+  return wrap(unwrap(T)->getDataLayout());
 }
 
 LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
@@ -158,14 +158,14 @@
 
   std::string error;
 
-  const TargetData* td = TM->getTargetData();
+  const DataLayout* td = TM->getDataLayout();
 
   if (!td) {
-    error = "No TargetData in TargetMachine";
+    error = "No DataLayout in TargetMachine";
     *ErrorMessage = strdup(error.c_str());
     return true;
   }
-  pass.add(new TargetData(*td));
+  pass.add(new DataLayout(*td));
 
   TargetMachine::CodeGenFileType ft;
   switch (codegen) {
@@ -184,7 +184,7 @@
   }
 
   if (TM->addPassesToEmitFile(pass, destf, ft)) {
-    error = "No TargetData in TargetMachine";
+    error = "No DataLayout in TargetMachine";
     *ErrorMessage = strdup(error.c_str());
     return true;
   }

Modified: llvm/branches/AMDILBackend/lib/Target/TargetRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/TargetRegisterInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/TargetRegisterInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/TargetRegisterInfo.cpp Tue Jan 15 11:16:16 2013
@@ -20,8 +20,10 @@
 
 TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
                              regclass_iterator RCB, regclass_iterator RCE,
-                             const char *const *subregindexnames)
-  : InfoDesc(ID), SubRegIndexNames(subregindexnames),
+                             const char *const *SRINames,
+                             const unsigned *SRILaneMasks)
+  : InfoDesc(ID), SubRegIndexNames(SRINames),
+    SubRegIndexLaneMasks(SRILaneMasks),
     RegClassBegin(RCB), RegClassEnd(RCE) {
 }
 

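The second table gives every sub-register index a lane mask, so overlapping
indices get overlapping bits. A minimal sketch of the kind of query this
enables (accessor name assumed from the upstream change this merges):

    // Two sub-register indices alias iff their lane masks intersect.
    bool lanesOverlap(const llvm::TargetRegisterInfo *TRI,
                      unsigned SubIdxA, unsigned SubIdxB) {
      return (TRI->getSubRegIndexLaneMask(SubIdxA) &
              TRI->getSubRegIndexLaneMask(SubIdxB)) != 0;
    }
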
Modified: llvm/branches/AMDILBackend/lib/Target/X86/AsmParser/X86AsmLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/AsmParser/X86AsmLexer.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/AsmParser/X86AsmLexer.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/AsmParser/X86AsmLexer.cpp Tue Jan 15 11:16:16 2013
@@ -18,19 +18,19 @@
 using namespace llvm;
 
 namespace {
-  
+
 class X86AsmLexer : public MCTargetAsmLexer {
   const MCAsmInfo &AsmInfo;
-  
+
   bool tentativeIsValid;
   AsmToken tentativeToken;
-  
+
   const AsmToken &lexTentative() {
     tentativeToken = getLexer()->Lex();
     tentativeIsValid = true;
     return tentativeToken;
   }
-  
+
   const AsmToken &lexDefinite() {
     if (tentativeIsValid) {
       tentativeIsValid = false;
@@ -38,7 +38,7 @@
     }
     return getLexer()->Lex();
   }
-  
+
   AsmToken LexTokenATT();
   AsmToken LexTokenIntel();
 protected:
@@ -47,7 +47,7 @@
       SetError(SMLoc(), "No MCAsmLexer installed");
       return AsmToken(AsmToken::Error, "", 0);
     }
-    
+
     switch (AsmInfo.getAssemblerDialect()) {
     default:
       SetError(SMLoc(), "Unhandled dialect");
@@ -71,33 +71,32 @@
 
 AsmToken X86AsmLexer::LexTokenATT() {
   AsmToken lexedToken = lexDefinite();
-  
+
   switch (lexedToken.getKind()) {
   default:
     return lexedToken;
   case AsmToken::Error:
     SetError(Lexer->getErrLoc(), Lexer->getErr());
     return lexedToken;
-      
+
   case AsmToken::Percent: {
     const AsmToken &nextToken = lexTentative();
     if (nextToken.getKind() != AsmToken::Identifier)
       return lexedToken;
 
-      
     if (unsigned regID = MatchRegisterName(nextToken.getString())) {
       lexDefinite();
-        
+
       // FIXME: This is completely wrong when there is a space or other
       // punctuation between the % and the register name.
       StringRef regStr(lexedToken.getString().data(),
-                       lexedToken.getString().size() + 
+                       lexedToken.getString().size() +
                        nextToken.getString().size());
-      
-      return AsmToken(AsmToken::Register, regStr, 
+
+      return AsmToken(AsmToken::Register, regStr,
                       static_cast<int64_t>(regID));
     }
-    
+
     // Match register name failed.  If this is "db[0-7]", match it as an alias
     // for dr[0-7].
     if (nextToken.getString().size() == 3 &&
@@ -113,29 +112,29 @@
       case '6': RegNo = X86::DR6; break;
       case '7': RegNo = X86::DR7; break;
       }
-      
+
       if (RegNo != -1) {
         lexDefinite();
 
         // FIXME: This is completely wrong when there is a space or other
         // punctuation between the % and the register name.
         StringRef regStr(lexedToken.getString().data(),
-                         lexedToken.getString().size() + 
+                         lexedToken.getString().size() +
                          nextToken.getString().size());
-        return AsmToken(AsmToken::Register, regStr, 
+        return AsmToken(AsmToken::Register, regStr,
                         static_cast<int64_t>(RegNo));
       }
     }
-      
-   
+
+
     return lexedToken;
-  }    
+  }
   }
 }
 
 AsmToken X86AsmLexer::LexTokenIntel() {
   const AsmToken &lexedToken = lexDefinite();
-  
+
   switch(lexedToken.getKind()) {
   default:
     return lexedToken;
@@ -144,7 +143,7 @@
     return lexedToken;
   case AsmToken::Identifier: {
     unsigned regID = MatchRegisterName(lexedToken.getString().lower());
-    
+
     if (regID)
       return AsmToken(AsmToken::Register,
                       lexedToken.getString(),

Modified: llvm/branches/AMDILBackend/lib/Target/X86/AsmParser/X86AsmParser.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/AsmParser/X86AsmParser.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/AsmParser/X86AsmParser.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/AsmParser/X86AsmParser.cpp Tue Jan 15 11:16:16 2013
@@ -11,12 +11,14 @@
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -33,13 +35,16 @@
 class X86AsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
+  ParseInstructionInfo *InstInfo;
 private:
   MCAsmParser &getParser() const { return Parser; }
 
   MCAsmLexer &getLexer() const { return Parser.getLexer(); }
 
   bool Error(SMLoc L, const Twine &Msg,
-             ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+             ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(),
+             bool MatchingInlineAsm = false) {
+    if (MatchingInlineAsm) return true;
     return Parser.Error(L, Msg, Ranges);
   }
 
@@ -51,23 +56,25 @@
   X86Operand *ParseOperand();
   X86Operand *ParseATTOperand();
   X86Operand *ParseIntelOperand();
-  X86Operand *ParseIntelMemOperand();
+  X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc);
+  X86Operand *ParseIntelTypeOperator(SMLoc StartLoc);
+  X86Operand *ParseIntelMemOperand(unsigned SegReg, SMLoc StartLoc);
   X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size);
   X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
 
+  bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr **NewDisp,
+                             SmallString<64> &Err);
+
   bool ParseDirectiveWord(unsigned Size, SMLoc L);
   bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
 
   bool processInstruction(MCInst &Inst,
                           const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
 
-  bool MatchAndEmitInstruction(SMLoc IDLoc,
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                               MCStreamer &Out);
-
-  bool MatchInstruction(SMLoc IDLoc,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                        SmallVectorImpl<MCInst> &MCInsts);
+                               MCStreamer &Out, unsigned &ErrorInfo,
+                               bool MatchingInlineAsm);
 
  /// isSrcOp - Returns true if operand is either (%rsi) or %ds:(%rsi)
  /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode.
@@ -96,14 +103,15 @@
 
 public:
   X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : MCTargetAsmParser(), STI(sti), Parser(parser) {
+    : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
   virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
 
-  virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
+  virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                SMLoc NameLoc,
                                 SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
   virtual bool ParseDirective(AsmToken DirectiveID);
@@ -159,6 +167,7 @@
   } Kind;
 
   SMLoc StartLoc, EndLoc;
+  SMLoc OffsetOfLoc;
 
   union {
     struct {
@@ -172,6 +181,7 @@
 
     struct {
       const MCExpr *Val;
+      bool NeedAsmRewrite;
     } Imm;
 
     struct {
@@ -181,6 +191,7 @@
       unsigned IndexReg;
       unsigned Scale;
       unsigned Size;
+      bool NeedSizeDir;
     } Mem;
   };
 
@@ -191,8 +202,11 @@
   SMLoc getStartLoc() const { return StartLoc; }
   /// getEndLoc - Get the location of the last token of this operand.
   SMLoc getEndLoc() const { return EndLoc; }
-
+  /// getLocRange - Get the range between the first and last token of this
+  /// operand.
   SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+  /// getOffsetOfLoc - Get the location of the offset operator.
+  SMLoc getOffsetOfLoc() const { return OffsetOfLoc; }
 
   virtual void print(raw_ostream &OS) const {}
 
@@ -216,6 +230,11 @@
     return Imm.Val;
   }
 
+  bool needAsmRewrite() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.NeedAsmRewrite;
+  }
+
   const MCExpr *getMemDisp() const {
     assert(Kind == Memory && "Invalid access!");
     return Mem.Disp;
@@ -312,6 +331,20 @@
     return isImmSExti64i32Value(CE->getValue());
   }
 
+  unsigned getMemSize() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Size;
+  }
+
+  bool isOffsetOf() const {
+    return OffsetOfLoc.getPointer();
+  }
+
+  bool needSizeDirective() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.NeedSizeDir;
+  }
+
   bool isMem() const { return Kind == Memory; }
   bool isMem8() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 8);
@@ -437,21 +470,25 @@
     return Res;
   }
 
-  static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc) {
+  static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+                               SMLoc OffsetOfLoc = SMLoc()) {
     X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
     Res->Reg.RegNo = RegNo;
+    Res->OffsetOfLoc = OffsetOfLoc;
     return Res;
   }
 
-  static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
+  static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc,
+                               bool NeedRewrite = true){
     X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
     Res->Imm.Val = Val;
+    Res->Imm.NeedAsmRewrite = NeedRewrite;
     return Res;
   }
 
   /// Create an absolute memory operand.
-  static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc,
-                               SMLoc EndLoc, unsigned Size = 0) {
+  static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+                               unsigned Size = 0, bool NeedSizeDir = false){
     X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
     Res->Mem.SegReg   = 0;
     Res->Mem.Disp     = Disp;
@@ -459,6 +496,7 @@
     Res->Mem.IndexReg = 0;
     Res->Mem.Scale    = 1;
     Res->Mem.Size     = Size;
+    Res->Mem.NeedSizeDir = NeedSizeDir;
     return Res;
   }
 
@@ -466,7 +504,7 @@
   static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
                                unsigned BaseReg, unsigned IndexReg,
                                unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
-                               unsigned Size = 0) {
+                               unsigned Size = 0, bool NeedSizeDir = false) {
     // We should never just have a displacement, that should be parsed as an
     // absolute memory operand.
     assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
@@ -481,6 +519,7 @@
     Res->Mem.IndexReg = IndexReg;
     Res->Mem.Scale    = Scale;
     Res->Mem.Size     = Size;
+    Res->Mem.NeedSizeDir = NeedSizeDir;
     return Res;
   }
 };
@@ -510,12 +549,13 @@
 bool X86AsmParser::ParseRegister(unsigned &RegNo,
                                  SMLoc &StartLoc, SMLoc &EndLoc) {
   RegNo = 0;
-  if (!isParsingIntelSyntax()) {
-    const AsmToken &TokPercent = Parser.getTok();
-    assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
-    StartLoc = TokPercent.getLoc();
+  const AsmToken &PercentTok = Parser.getTok();
+  StartLoc = PercentTok.getLoc();
+
+  // If we encounter a %, ignore it. This code handles registers with and
+  // without the prefix; unprefixed registers can occur in CFI directives.
+  if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
     Parser.Lex(); // Eat percent token.
-  }
 
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier)) {
@@ -621,23 +661,25 @@
 
 /// getIntelMemOperandSize - Return the Intel memory operand size.
 static unsigned getIntelMemOperandSize(StringRef OpStr) {
-  unsigned Size = 0;
-  if (OpStr == "BYTE") Size = 8;
-  if (OpStr == "WORD") Size = 16;
-  if (OpStr == "DWORD") Size = 32;
-  if (OpStr == "QWORD") Size = 64;
-  if (OpStr == "XWORD") Size = 80;
-  if (OpStr == "XMMWORD") Size = 128;
-  if (OpStr == "YMMWORD") Size = 256;
+  unsigned Size = StringSwitch<unsigned>(OpStr)
+    .Cases("BYTE", "byte", 8)
+    .Cases("WORD", "word", 16)
+    .Cases("DWORD", "dword", 32)
+    .Cases("QWORD", "qword", 64)
+    .Cases("XWORD", "xword", 80)
+    .Cases("XMMWORD", "xmmword", 128)
+    .Cases("YMMWORD", "ymmword", 256)
+    .Default(0);
   return Size;
 }
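
The StringSwitch form folds the upper- and lower-case spellings into one
lookup, and anything it does not list falls through to 0. Illustrative calls,
with values read straight off the table above:

    unsigned A = getIntelMemOperandSize("QWORD");    // 64
    unsigned B = getIntelMemOperandSize("ymmword");  // 256
    unsigned C = getIntelMemOperandSize("tbyte");    // 0 (not in the table)
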
 
-X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
+X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, 
                                                    unsigned Size) {
   unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
-  SMLoc Start = Parser.getTok().getLoc(), End;
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Start = Tok.getLoc(), End;
 
-  const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+  const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
   // Parse [ BaseReg + Scale*IndexReg + Disp ] or [ symbol ]
 
   // Eat '['
@@ -653,15 +695,17 @@
       if (getLexer().isNot(AsmToken::RBrac))
         return ErrorOperand(Start, "Expected ']' token!");
       Parser.Lex();
+      End = Tok.getLoc();
       return X86Operand::CreateMem(Disp, Start, End, Size);
     }
   } else if (getLexer().is(AsmToken::Integer)) {
-      int64_t Val = Parser.getTok().getIntVal();
+      int64_t Val = Tok.getIntVal();
       Parser.Lex();
-      SMLoc Loc = Parser.getTok().getLoc();
+      SMLoc Loc = Tok.getLoc();
       if (getLexer().is(AsmToken::RBrac)) {
         // Handle '[' number ']'
         Parser.Lex();
+        End = Tok.getLoc();
         const MCExpr *Disp = MCConstantExpr::Create(Val, getContext());
         if (SegReg)
           return X86Operand::CreateMem(SegReg, Disp, 0, 0, Scale,
@@ -670,7 +714,7 @@
       } else if (getLexer().is(AsmToken::Star)) {
         // Handle '[' Scale*IndexReg ']'
         Parser.Lex();
-        SMLoc IdxRegLoc = Parser.getTok().getLoc();
+        SMLoc IdxRegLoc = Tok.getLoc();
         if (ParseRegister(IndexReg, IdxRegLoc, End))
           return ErrorOperand(IdxRegLoc, "Expected register");
         Scale = Val;
@@ -678,16 +722,27 @@
         return ErrorOperand(Loc, "Unexpected token");
   }
 
-  if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) {
-    bool isPlus = getLexer().is(AsmToken::Plus);
+  // Parse ][ as a plus.
+  bool ExpectRBrac = true;
+  if (getLexer().is(AsmToken::RBrac)) {
+    ExpectRBrac = false;
     Parser.Lex();
-    SMLoc PlusLoc = Parser.getTok().getLoc();
+    End = Tok.getLoc();
+  }
+
+  if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus) ||
+      getLexer().is(AsmToken::LBrac)) {
+    ExpectRBrac = true;
+    bool isPlus = getLexer().is(AsmToken::Plus) ||
+      getLexer().is(AsmToken::LBrac);
+    Parser.Lex();
+    SMLoc PlusLoc = Tok.getLoc();
     if (getLexer().is(AsmToken::Integer)) {
-      int64_t Val = Parser.getTok().getIntVal();
+      int64_t Val = Tok.getIntVal();
       Parser.Lex();
       if (getLexer().is(AsmToken::Star)) {
         Parser.Lex();
-        SMLoc IdxRegLoc = Parser.getTok().getLoc();
+        SMLoc IdxRegLoc = Tok.getLoc();
         if (ParseRegister(IndexReg, IdxRegLoc, End))
           return ErrorOperand(IdxRegLoc, "Expected register");
         Scale = Val;
@@ -698,21 +753,48 @@
         return ErrorOperand(PlusLoc, "unexpected token after +");
     } else if (getLexer().is(AsmToken::Identifier)) {
       // This could be an index register or a displacement expression.
-      End = Parser.getTok().getLoc();
+      End = Tok.getLoc();
       if (!IndexReg)
         ParseRegister(IndexReg, Start, End);
       else if (getParser().ParseExpression(Disp, End)) return 0;
     }
   }
+
+  // Parse ][ as a plus.
+  if (getLexer().is(AsmToken::RBrac)) {
+    ExpectRBrac = false;
+    Parser.Lex();
+    End = Tok.getLoc();
+    if (getLexer().is(AsmToken::LBrac)) {
+      ExpectRBrac = true;
+      Parser.Lex();
+      if (getParser().ParseExpression(Disp, End))
+        return 0;
+    }
+  } else if (ExpectRBrac) {
+      if (getParser().ParseExpression(Disp, End))
+        return 0;
+  }
 
-  if (getLexer().isNot(AsmToken::RBrac))
-    if (getParser().ParseExpression(Disp, End)) return 0;
+  if (ExpectRBrac) {
+    if (getLexer().isNot(AsmToken::RBrac))
+      return ErrorOperand(End, "expected ']' token!");
+    Parser.Lex();
+    End = Tok.getLoc();
+  }
 
-  End = Parser.getTok().getLoc();
-  if (getLexer().isNot(AsmToken::RBrac))
-    return ErrorOperand(End, "expected ']' token!");
-  Parser.Lex();
-  End = Parser.getTok().getLoc();
+  // Parse the dot operator (e.g., [ebx].foo.bar).
+  if (Tok.getString().startswith(".")) {
+    SmallString<64> Err;
+    const MCExpr *NewDisp;
+    if (ParseIntelDotOperator(Disp, &NewDisp, Err))
+      return ErrorOperand(Tok.getLoc(), Err);
+
+    Parser.Lex();  // Eat the field.
+    Disp = NewDisp;
+  }
+
+  End = Tok.getLoc();
 
   // Handle [-42].
   if (!BaseReg && !IndexReg)
@@ -723,15 +805,15 @@
 }
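
Taken together, the ExpectRBrac bookkeeping above lets the Intel parser fold
']' '[' into an addition and accept MASM-style chained brackets plus the dot
operator. A few hypothetical source forms that now parse to a single memory
operand:

    // mov eax, [ebx][esi*4][2]   ; same operand as [ebx + esi*4 + 2]
    // mov eax, [ebx + esi*4][2]  ; mixed form, folded the same way
    // mov eax, [ebx].4           ; dot operator: constant displacement 4
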
 
 /// ParseIntelMemOperand - Parse an Intel-style memory operand.
-X86Operand *X86AsmParser::ParseIntelMemOperand() {
+X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) {
   const AsmToken &Tok = Parser.getTok();
-  SMLoc Start = Parser.getTok().getLoc(), End;
-  unsigned SegReg = 0;
+  SMLoc End;
 
   unsigned Size = getIntelMemOperandSize(Tok.getString());
   if (Size) {
     Parser.Lex();
-    assert (Tok.getString() == "PTR" && "Unexpected token!");
+    assert ((Tok.getString() == "PTR" || Tok.getString() == "ptr") &&
+            "Unexpected token!");
     Parser.Lex();
   }
 
@@ -750,12 +832,164 @@
 
   const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
   if (getParser().ParseExpression(Disp, End)) return 0;
-  return X86Operand::CreateMem(Disp, Start, End, Size);
+  End = Parser.getTok().getLoc();
+
+  bool NeedSizeDir = false;
+  if (!Size && isParsingInlineAsm()) {
+    if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) {
+      const MCSymbol &Sym = SymRef->getSymbol();
+      // FIXME: The SemaLookup will fail if the name is anything other than an
+      // identifier.
+      // FIXME: Pass a valid SMLoc.
+      SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size);
+      NeedSizeDir = Size > 0;
+    }
+  }
+  if (!isParsingInlineAsm())
+    return X86Operand::CreateMem(Disp, Start, End, Size);
+  else
+    // When parsing inline assembly, we set the base register to a non-zero
+    // value, as we don't know the actual value at this time.  This is
+    // necessary to get the matching correct in some cases.
+    return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0,
+                                 /*Scale*/1, Start, End, Size, NeedSizeDir);
+}
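
A rough sketch of the inline-asm path for a bare identifier, say
`mov eax, x`, where only the front end knows that x is 32 bits wide (the name
and size here are illustrative):

    unsigned Size = 0;
    // Ask the front end, via the Sema callback, how wide 'x' is in bits.
    SemaCallback->LookupInlineAsmIdentifier("x", NULL, Size);  // Size == 32
    bool NeedSizeDir = Size > 0;   // later rewritten as "dword ptr x"
    // BaseReg is set to the placeholder 1 so the matcher selects a
    // register-based memory form.
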
+
+/// Parse the '.' operator.
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+                                         const MCExpr **NewDisp,
+                                         SmallString<64> &Err) {
+  AsmToken Tok = Parser.getTok();
+  uint64_t OrigDispVal, DotDispVal;
+
+  // FIXME: Handle non-constant expressions.
+  if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) {
+    OrigDispVal = OrigDisp->getValue();
+  } else {
+    Err = "Non-constant offsets are not supported!";
+    return true;
+  }
+
+  // Drop the '.'.
+  StringRef DotDispStr = Tok.getString().drop_front(1);
+
+  // .Imm gets lexed as a real.
+  if (Tok.is(AsmToken::Real)) {
+    APInt DotDisp;
+    DotDispStr.getAsInteger(10, DotDisp);
+    DotDispVal = DotDisp.getZExtValue();
+  } else if (Tok.is(AsmToken::Identifier)) {
+    // We should only see an identifier when parsing the original inline asm.
+    // The front-end should rewrite this in terms of immediates.
+    assert (isParsingInlineAsm() && "Unexpected field name!");
+
+    unsigned DotDisp;
+    std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+    if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
+                                           DotDisp)) {
+      Err = "Unable to lookup field reference!";
+      return true;
+    }
+    DotDispVal = DotDisp;
+  } else {
+    Err = "Unexpected token type!";
+    return true;
+  }
+
+  if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+    SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
+    unsigned Len = DotDispStr.size();
+    unsigned Val = OrigDispVal + DotDispVal;
+    InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len,
+                                                Val));
+  }
+
+  *NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
+  return false;
+}
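
A worked example with an assumed struct layout (the names and the offset of 8
are hypothetical): for `[ebx].foo.bar` the parser splits the field reference
into base "foo" and member "bar", asks LookupInlineAsmField for the byte
offset, and folds it into the displacement:

    // OrigDispVal = 0 (from the '[ebx]' part), DotDispVal = 8 (from Sema)
    // NewDisp = MCConstantExpr::Create(0 + 8, Ctx)      -> i.e. [ebx + 8]
    // The AOK_DotOperator rewrite marks the ".foo.bar" source span for
    // replacement with the constant 8.
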
+
+/// Parse the 'offset' operator.  This operator is used to specify the
+/// location rather than the content of a variable.
+X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) {
+  SMLoc OffsetOfLoc = Start;
+  Parser.Lex(); // Eat offset.
+  Start = Parser.getTok().getLoc();
+  assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier");
+
+  SMLoc End;
+  const MCExpr *Val;
+  if (getParser().ParseExpression(Val, End))
+    return ErrorOperand(Start, "Unable to parse expression!");
+
+  End = Parser.getTok().getLoc();
+
+  // Don't emit the offset operator.
+  InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
+
+  // The offset operator will have an 'r' constraint, thus we need to create
+  // a register operand to ensure proper matching.  Just pick a GPR based on
+  // the size of a pointer.
+  unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+  return X86Operand::CreateReg(RegNo, Start, End, OffsetOfLoc);
+}
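
So for MS-style inline asm such as `mov eax, offset g` (g being a stand-in
symbol), the keyword is dropped from the rewritten string via AOK_Skip (a
length of 7 as written above, i.e. the keyword plus its separator), and the
operand is matched as a register because the front end gives such expressions
an 'r' constraint:

    // Source:   mov eax, offset g
    // Rewrite:  skip "offset " -> mov eax, g
    // Matching: operand modeled as EBX (RBX in 64-bit mode), with
    //           OffsetOfLoc recording where the keyword appeared.
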
+
+/// Parse the 'TYPE' operator.  The TYPE operator returns the size of a C or
+/// C++ type or variable. If the variable is an array, TYPE returns the size of
+/// a single element of the array.
+X86Operand *X86AsmParser::ParseIntelTypeOperator(SMLoc Start) {
+  SMLoc TypeLoc = Start;
+  Parser.Lex(); // Eat 'TYPE'.
+  Start = Parser.getTok().getLoc();
+  assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier");
+
+  SMLoc End;
+  const MCExpr *Val;
+  if (getParser().ParseExpression(Val, End))
+    return 0;
+
+  End = Parser.getTok().getLoc();
+
+  unsigned Size = 0;
+  if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Val)) {
+    const MCSymbol &Sym = SymRef->getSymbol();
+    // FIXME: The SemaLookup will fail if the name is anything other than an
+    // identifier.
+    // FIXME: Pass a valid SMLoc.
+    if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size))
+      return ErrorOperand(Start, "Unable to lookup TYPE of expr!");
+
+    Size /= 8; // Size is in bits; we want bytes here.
+  }
+
+  // Rewrite the type operator and the C or C++ type or variable in terms of an
+  // immediate.  E.g. TYPE foo -> $$4
+  unsigned Len = End.getPointer() - TypeLoc.getPointer();
+  InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, Size));
+
+  const MCExpr *Imm = MCConstantExpr::Create(Size, getContext());
+  return X86Operand::CreateImm(Imm, Start, End, /*NeedAsmRewrite*/false);
 }
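
End to end, assuming foo is a 32-bit variable (a hypothetical example): the
Sema lookup reports 32 bits, the division yields 4 bytes, and the whole span
is rewritten as an immediate:

    // Source:  mov eax, TYPE foo
    // Lookup:  Size = 32 bits, then Size /= 8 gives 4 bytes
    // Rewrite: AOK_Imm over "TYPE foo" -> mov eax, $$4
    // Operand: CreateImm(4, ..., /*NeedAsmRewrite=*/false), since the
    //          rewrite above already replaced the source text.
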
 
 X86Operand *X86AsmParser::ParseIntelOperand() {
   SMLoc Start = Parser.getTok().getLoc(), End;
 
+  // The 'offset' operator.
+  StringRef AsmTokStr = Parser.getTok().getString();
+  if ((AsmTokStr == "offset" || AsmTokStr == "OFFSET") &&
+      isParsingInlineAsm())
+    return ParseIntelOffsetOfOperator(Start);
+
+  // The 'TYPE' operator.
+  if ((AsmTokStr == "type" || AsmTokStr == "TYPE") &&
+      isParsingInlineAsm())
+    return ParseIntelTypeOperator(Start);
+
+  // Unsupported directives.
+  if (isParsingIntelSyntax() &&
+      (AsmTokStr == "size" || AsmTokStr == "SIZE" ||
+       AsmTokStr == "length" || AsmTokStr == "LENGTH"))
+      return ErrorOperand(Start, "Unsupported directive!");
+
   // immediate.
   if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) ||
       getLexer().is(AsmToken::Minus)) {
@@ -769,12 +1003,17 @@
   // register
   unsigned RegNo = 0;
   if (!ParseRegister(RegNo, Start, End)) {
-    End = Parser.getTok().getLoc();
-    return X86Operand::CreateReg(RegNo, Start, End);
+    // If this is a segment register followed by a ':', then this is the start
+    // of a memory reference; otherwise this is a normal register reference.
+    if (getLexer().isNot(AsmToken::Colon))
+      return X86Operand::CreateReg(RegNo, Start, Parser.getTok().getLoc());
+
+    getParser().Lex(); // Eat the colon.
+    return ParseIntelMemOperand(RegNo, Start);
   }
 
   // mem operand
-  return ParseIntelMemOperand();
+  return ParseIntelMemOperand(0, Start);
 }
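
The new colon check makes segment-prefixed Intel operands work: a register
followed by ':' is reinterpreted as the start of a memory reference instead of
a plain register operand. Hypothetical examples:

    // mov eax, fs:[10h]  ; FS becomes SegReg, rest via ParseIntelMemOperand
    // mov eax, fs        ; no colon follows, so a normal register operand
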
 
 X86Operand *X86AsmParser::ParseATTOperand() {
@@ -972,8 +1211,9 @@
 }
 
 bool X86AsmParser::
-ParseInstruction(StringRef Name, SMLoc NameLoc,
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
                  SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  InstInfo = &Info;
   StringRef PatchedName = Name;
 
   // FIXME: Hack to recognize setneb as setne.
@@ -1509,28 +1749,18 @@
 }
 
 bool X86AsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc,
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                         SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                        MCStreamer &Out) {
-  SmallVector<MCInst, 2> Insts;
-  bool Error = MatchInstruction(IDLoc, Operands, Insts);
-  if (!Error)
-    for (unsigned i = 0, e = Insts.size(); i != e; ++i)
-      Out.EmitInstruction(Insts[i]);
-  return Error;
-}
-
-bool X86AsmParser::
-MatchInstruction(SMLoc IDLoc,
-                 SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                 SmallVectorImpl<MCInst> &MCInsts) {
+                        MCStreamer &Out, unsigned &ErrorInfo,
+                        bool MatchingInlineAsm) {
   assert(!Operands.empty() && "Unexpected empty operand list!");
   X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
   assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+  ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>();
 
   // First, handle aliases that expand to multiple instructions.
   // FIXME: This should be replaced with a real .td file alias mechanism.
-  // Also, MatchInstructionImpl should do actually *do* the EmitInstruction
+  // Also, MatchInstructionImpl should actually *do* the EmitInstruction
   // call.
   if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
       Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
@@ -1539,7 +1769,8 @@
     MCInst Inst;
     Inst.setOpcode(X86::WAIT);
     Inst.setLoc(IDLoc);
-    MCInsts.push_back(Inst);
+    if (!MatchingInlineAsm)
+      Out.EmitInstruction(Inst);
 
     const char *Repl =
       StringSwitch<const char*>(Op->getToken())
@@ -1558,28 +1789,30 @@
   }
 
   bool WasOriginallyInvalidOperand = false;
-  unsigned OrigErrorInfo;
   MCInst Inst;
 
   // First, try a direct match.
-  switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo,
+  switch (MatchInstructionImpl(Operands, Inst,
+                               ErrorInfo, MatchingInlineAsm,
                                isParsingIntelSyntax())) {
   default: break;
   case Match_Success:
     // Some instructions need post-processing to, for example, tweak which
     // encoding is selected. Loop on it while changes happen so the
     // individual transformations can chain off each other.
-    while (processInstruction(Inst, Operands))
-      ;
+    if (!MatchingInlineAsm)
+      while (processInstruction(Inst, Operands))
+        ;
 
     Inst.setLoc(IDLoc);
-    MCInsts.push_back(Inst);
+    if (!MatchingInlineAsm)
+      Out.EmitInstruction(Inst);
+    Opcode = Inst.getOpcode();
     return false;
   case Match_MissingFeature:
-    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+    Error(IDLoc, "instruction requires a CPU feature not currently enabled",
+          EmptyRanges, MatchingInlineAsm);
     return true;
-  case Match_ConversionFail:
-    return Error(IDLoc, "unable to convert operands to instruction");
   case Match_InvalidOperand:
     WasOriginallyInvalidOperand = true;
     break;
@@ -1612,13 +1845,17 @@
   unsigned ErrorInfoIgnore;
   unsigned Match1, Match2, Match3, Match4;
 
-  Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+  Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                isParsingIntelSyntax());
   Tmp[Base.size()] = Suffixes[1];
-  Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+  Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                isParsingIntelSyntax());
   Tmp[Base.size()] = Suffixes[2];
-  Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+  Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                isParsingIntelSyntax());
   Tmp[Base.size()] = Suffixes[3];
-  Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+  Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                isParsingIntelSyntax());
 
   // Restore the old token.
   Op->setTokenValue(Base);
@@ -1631,7 +1868,9 @@
     (Match3 == Match_Success) + (Match4 == Match_Success);
   if (NumSuccessfulMatches == 1) {
     Inst.setLoc(IDLoc);
-    MCInsts.push_back(Inst);
+    if (!MatchingInlineAsm)
+      Out.EmitInstruction(Inst);
+    Opcode = Inst.getOpcode();
     return false;
   }
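
For context, this is the AT&T suffix-recovery path: a mnemonic that failed to
match is retried with each size suffix, and only an unambiguous winner is
accepted. Sketch for a hypothetical input:

    // "inc (%rax)" carries no size suffix, so the matcher is retried as
    // "incb", "incw", "incl", "incq".  Exactly one success -> emit it;
    // several -> the ambiguity diagnostic below; none -> the more specific
    // error reporting further down.
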
 
@@ -1658,7 +1897,7 @@
       OS << "'" << Base << MatchChars[i] << "'";
     }
     OS << ")";
-    Error(IDLoc, OS.str());
+    Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
     return true;
   }
 
@@ -1669,31 +1908,36 @@
   if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
       (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
     if (!WasOriginallyInvalidOperand) {
+      ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges :
+        Op->getLocRange();
       return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
-                   Op->getLocRange());
+                   Ranges, MatchingInlineAsm);
     }
 
     // Recover location info for the operand if we know which was the problem.
-    if (OrigErrorInfo != ~0U) {
-      if (OrigErrorInfo >= Operands.size())
-        return Error(IDLoc, "too few operands for instruction");
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction",
+                     EmptyRanges, MatchingInlineAsm);
 
-      X86Operand *Operand = (X86Operand*)Operands[OrigErrorInfo];
+      X86Operand *Operand = (X86Operand*)Operands[ErrorInfo];
       if (Operand->getStartLoc().isValid()) {
         SMRange OperandRange = Operand->getLocRange();
         return Error(Operand->getStartLoc(), "invalid operand for instruction",
-                     OperandRange);
+                     OperandRange, MatchingInlineAsm);
       }
     }
 
-    return Error(IDLoc, "invalid operand for instruction");
+    return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+                 MatchingInlineAsm);
   }
 
   // If one instruction matched with a missing feature, report this as a
   // missing feature.
   if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) +
       (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){
-    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+    Error(IDLoc, "instruction requires a CPU feature not currently enabled",
+          EmptyRanges, MatchingInlineAsm);
     return true;
   }
 
@@ -1701,12 +1945,14 @@
   // operand failure.
   if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) +
       (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){
-    Error(IDLoc, "invalid operand for instruction");
+    Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+          MatchingInlineAsm);
     return true;
   }
 
   // If all of these were an outright failure, report it in a useless way.
-  Error(IDLoc, "unknown use of instruction mnemonic without a size suffix");
+  Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+        EmptyRanges, MatchingInlineAsm);
   return true;
 }
 
@@ -1717,7 +1963,10 @@
     return ParseDirectiveWord(2, DirectiveID.getLoc());
   else if (IDVal.startswith(".code"))
     return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
-  else if (IDVal.startswith(".intel_syntax")) {
+  else if (IDVal.startswith(".att_syntax")) {
+    getParser().setAssemblerDialect(0);
+    return false;
+  } else if (IDVal.startswith(".intel_syntax")) {
     getParser().setAssemblerDialect(1);
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
       if(Parser.getTok().getString() == "noprefix") {

Modified: llvm/branches/AMDILBackend/lib/Target/X86/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/CMakeLists.txt?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/CMakeLists.txt (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/CMakeLists.txt Tue Jan 15 11:16:16 2013
@@ -17,7 +17,6 @@
   X86AsmPrinter.cpp
   X86COFFMachineModuleInfo.cpp
   X86CodeEmitter.cpp
-  X86ELFWriterInfo.cpp
   X86FastISel.cpp
   X86FloatingPoint.cpp
   X86FrameLowering.cpp

Modified: llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86Disassembler.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86Disassembler.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86Disassembler.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86Disassembler.cpp Tue Jan 15 11:16:16 2013
@@ -44,7 +44,7 @@
   dbgs() << file << ":" << line << ": " << s;
 }
 
-const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii) {
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
   const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
   return MII->getName(Opcode);
 }
@@ -95,8 +95,8 @@
 ///                   be a pointer to a MemoryObject.
 /// @param byte     - A pointer to the byte to be read.
 /// @param address  - The address to be read.
-static int regionReader(void* arg, uint8_t* byte, uint64_t address) {
-  MemoryObject* region = static_cast<MemoryObject*>(arg);
+static int regionReader(const void* arg, uint8_t* byte, uint64_t address) {
+  const MemoryObject* region = static_cast<const MemoryObject*>(arg);
   return region->readByte(address, byte);
 }
 
@@ -135,10 +135,10 @@
   
   int ret = decodeInstruction(&internalInstr,
                               regionReader,
-                              (void*)&region,
+                              (const void*)&region,
                               loggerFn,
                               (void*)&vStream,
-                              (void*)MII,
+                              (const void*)MII,
                               address,
                               fMode);
 
@@ -379,6 +379,8 @@
   }
 
   switch (type) {
+  case TYPE_XMM32:
+  case TYPE_XMM64:
   case TYPE_XMM128:
     mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
     return;

Modified: llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86Disassembler.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86Disassembler.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86Disassembler.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86Disassembler.h Tue Jan 15 11:16:16 2013
@@ -78,7 +78,7 @@
   uint16_t operands;
 
 #define INSTRUCTION_IDS               \
-  unsigned instructionIDs;
+  uint16_t instructionIDs;
 
 #include "X86DisassemblerDecoderCommon.h"
 

Modified: llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c Tue Jan 15 11:16:16 2013
@@ -138,6 +138,10 @@
     if (modFromModRM(modRM) == 0x3)
       return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
     return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+  case MODRM_SPLITMISC:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
+    return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
   case MODRM_FULL:
     return modRMTable[dec->instructionIDs+modRM];
   }
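
Two worked lookups for the new MODRM_SPLITMISC case (byte values chosen for
illustration):

    /* modRM = 0xE1: mod == 3, index = instructionIDs + (0xE1 & 0x3f) + 8
                                     = instructionIDs + 33 + 8            */
    /* modRM = 0x59: mod != 3, index = instructionIDs + ((0x59 & 0x38) >> 3)
                                     = instructionIDs + 3                 */
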
@@ -200,7 +204,7 @@
                              insn->readerCursor + offset);        \
       if (ret)                                                    \
         return ret;                                               \
-      combined = combined | ((type)byte << ((type)offset * 8));   \
+      combined = combined | ((uint64_t)byte << (offset * 8));     \
     }                                                             \
     *ptr = combined;                                              \
     insn->readerCursor += sizeof(type);                           \
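
The cast change makes the shift independent of the macro's `type` parameter:
the arithmetic is now always performed in uint64_t rather than in whatever
(possibly narrower) type the macro was instantiated with. A standalone
illustration of the corrected pattern (a sketch, not the macro itself):

    /* Assemble 0x12345678 from little-endian bytes, shifting in uint64_t. */
    uint8_t bytes[4] = { 0x78, 0x56, 0x34, 0x12 };
    uint64_t combined = 0;
    int offset;
    for (offset = 0; offset < 4; ++offset)
      combined = combined | ((uint64_t)bytes[offset] << (offset * 8));
    /* combined == 0x12345678 */
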
@@ -690,7 +694,7 @@
  * @param orig  - The instruction that is not 16-bit
  * @param equiv - The instruction that is 16-bit
  */
-static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
+static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
   off_t i;
   
   for (i = 0;; i++) {
@@ -719,7 +723,7 @@
  * @return      - 0 if the ModR/M could be read when needed or was not needed;
  *                nonzero otherwise.
  */
-static int getID(struct InternalInstruction* insn, void *miiArg) {
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
   uint8_t attrMask;
   uint16_t instructionID;
   
@@ -856,7 +860,7 @@
     specWithOpSizeName =
       x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
 
-    if (is16BitEquvalent(specName, specWithOpSizeName)) {
+    if (is16BitEquivalent(specName, specWithOpSizeName)) {
       insn->instructionID = instructionIDWithOpsize;
       insn->spec = specifierForUID(instructionIDWithOpsize);
     } else {
@@ -1621,10 +1625,10 @@
  */
 int decodeInstruction(struct InternalInstruction* insn,
                       byteReader_t reader,
-                      void* readerArg,
+                      const void* readerArg,
                       dlog_t logger,
                       void* loggerArg,
-                      void* miiArg,
+                      const void* miiArg,
                       uint64_t startLoc,
                       DisassemblerMode mode) {
   memset(insn, 0, sizeof(struct InternalInstruction));

Modified: llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h Tue Jan 15 11:16:16 2013
@@ -24,7 +24,7 @@
   uint16_t operands;
 
 #define INSTRUCTION_IDS     \
-  unsigned instructionIDs;
+  uint16_t instructionIDs;
 
 #include "X86DisassemblerDecoderCommon.h"
 
@@ -403,7 +403,7 @@
  *                  be read from.
  * @return        - -1 if the byte cannot be read for any reason; 0 otherwise.
  */
-typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address);
+typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
 
 /*
  * dlog_t - Type for the logging function that the consumer can provide to
@@ -422,7 +422,7 @@
   /* Reader interface (C) */
   byteReader_t reader;
   /* Opaque value passed to the reader */
-  void* readerArg;
+  const void* readerArg;
   /* The address of the next byte to read via the reader */
   uint64_t readerCursor;
 
@@ -561,10 +561,10 @@
  */
 int decodeInstruction(struct InternalInstruction* insn,
                       byteReader_t reader,
-                      void* readerArg,
+                      const void* readerArg,
                       dlog_t logger,
                       void* loggerArg,
-                      void* miiArg,
+                      const void* miiArg,
                       uint64_t startLoc,
                       DisassemblerMode mode);
 
@@ -579,7 +579,7 @@
                           unsigned line,
                           const char *s);
 
-const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii);
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii);
 
 #ifdef __cplusplus
 }

Modified: llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h Tue Jan 15 11:16:16 2013
@@ -160,6 +160,10 @@
  * MODRM_SPLITRM  - If the ModR/M byte is between 0x00 and 0xbf, the opcode
  *                  corresponds to one instruction; otherwise, it corresponds to
  *                  a different instruction.
+ * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, the ModR/M
+ *                  byte divided by 8 is used to select the instruction;
+ *                  otherwise, each value of the ModR/M byte could correspond
+ *                  to a different instruction.
  * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select the
  *                  instruction. This corresponds to instructions that use
  *                  the reg field as an opcode.
  * MODRM_FULL     - Potentially, each value of the ModR/M byte could correspond
@@ -169,6 +173,7 @@
 #define MODRMTYPES            \
   ENUM_ENTRY(MODRM_ONEENTRY)  \
   ENUM_ENTRY(MODRM_SPLITRM)   \
+  ENUM_ENTRY(MODRM_SPLITMISC) \
   ENUM_ENTRY(MODRM_SPLITREG)  \
   ENUM_ENTRY(MODRM_FULL)
 

Modified: llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp Tue Jan 15 11:16:16 2013
@@ -15,6 +15,7 @@
 #define DEBUG_TYPE "asm-printer"
 #include "X86ATTInstPrinter.h"
 #include "X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -33,11 +34,19 @@
 
 void X86ATTInstPrinter::printRegName(raw_ostream &OS,
                                      unsigned RegNo) const {
-  OS << '%' << getRegisterName(RegNo);
+  OS << markup("<reg:")
+     << '%' << getRegisterName(RegNo)
+     << markup(">");
 }
 
 void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
                                   StringRef Annot) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\n";
+
   // Try to print any aliases first.
   if (!printAliasInstr(MI, OS))
     printInstruction(MI, OS);
@@ -52,7 +61,8 @@
 
 void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
                                    raw_ostream &O) {
-  switch (MI->getOperand(Op).getImm()) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+  switch (Imm) {
   default: llvm_unreachable("Invalid ssecc argument!");
   case    0: O << "eq"; break;
   case    1: O << "lt"; break;
@@ -70,6 +80,30 @@
   case  0xd: O << "ge"; break;
   case  0xe: O << "gt"; break;
   case  0xf: O << "true"; break;
+  }
+}
+
+void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid avxcc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
   case 0x10: O << "eq_os"; break;
   case 0x11: O << "lt_oq"; break;
   case 0x12: O << "le_oq"; break;
@@ -89,12 +123,12 @@
   }
 }
 
-/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// printPCRelImm - This is used to print an immediate value that ends up
 /// being encoded as a pc-relative value (e.g. for jumps and calls).  These
 /// print slightly differently than normal immediates.  For example, a $ is not
 /// emitted.
-void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo,
-                                        raw_ostream &O) {
+void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isImm())
     O << Op.getImm();
@@ -119,17 +153,21 @@
                                      raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isReg()) {
-    O << '%' << getRegisterName(Op.getReg());
+    printRegName(O, Op.getReg());
   } else if (Op.isImm()) {
     // Print X86 immediates as signed values.
-    O << '$' << (int64_t)Op.getImm();
+    O << markup("<imm:")
+      << '$' << (int64_t)Op.getImm()
+      << markup(">");
     
     if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
       *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
     
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << '$' << *Op.getExpr();
+    O << markup("<imm:")
+      << '$' << *Op.getExpr()
+      << markup(">");
   }
 }
 
@@ -140,6 +178,8 @@
   const MCOperand &DispSpec = MI->getOperand(Op+3);
   const MCOperand &SegReg = MI->getOperand(Op+4);
   
+  O << markup("<mem:");
+
   // If this has a segment register, print it.
   if (SegReg.getReg()) {
     printOperand(MI, Op+4, O);
@@ -164,9 +204,15 @@
       O << ',';
       printOperand(MI, Op+2, O);
       unsigned ScaleVal = MI->getOperand(Op+1).getImm();
-      if (ScaleVal != 1)
-        O << ',' << ScaleVal;
+      if (ScaleVal != 1) {
+        O << ','
+          << markup("<imm:")
+          << ScaleVal
+          << markup(">");
+      }
     }
     O << ')';
   }
+
+  O << markup(">");
 }

Modified: llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h Tue Jan 15 11:16:16 2013
@@ -40,7 +40,8 @@
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
   void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
   void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
-  void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
   
   void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);

Modified: llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp Tue Jan 15 11:16:16 2013
@@ -1,4 +1,4 @@
-//===-- X86IntelInstPrinter.cpp - AT&T assembly instruction printing ------===//
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file includes code for rendering MCInst instances as AT&T-style
+// This file includes code for rendering MCInst instances as Intel-style
 // assembly.
 //
 //===----------------------------------------------------------------------===//
@@ -15,6 +15,7 @@
 #define DEBUG_TYPE "asm-printer"
 #include "X86IntelInstPrinter.h"
 #include "X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCExpr.h"
@@ -32,6 +33,12 @@
 
 void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
                                     StringRef Annot) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\n";
+
   printInstruction(MI, OS);
 
   // Next always print the annotation.
@@ -44,7 +51,8 @@
 
 void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
                                      raw_ostream &O) {
-  switch (MI->getOperand(Op).getImm()) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+  switch (Imm) {
   default: llvm_unreachable("Invalid ssecc argument!");
   case    0: O << "eq"; break;
   case    1: O << "lt"; break;
@@ -62,6 +70,30 @@
   case  0xd: O << "ge"; break;
   case  0xe: O << "gt"; break;
   case  0xf: O << "true"; break;
+  }
+}
+
+void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid avxcc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
   case 0x10: O << "eq_os"; break;
   case 0x11: O << "lt_oq"; break;
   case 0x12: O << "le_oq"; break;
@@ -78,14 +110,13 @@
   case 0x1d: O << "ge_oq"; break;
   case 0x1e: O << "gt_oq"; break;
   case 0x1f: O << "true_us"; break;
-
   }
 }
 
-/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// printPCRelImm - This is used to print an immediate value that ends up
 /// being encoded as a pc-relative value.
-void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo,
-                                          raw_ostream &O) {
+void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isImm())
     O << Op.getImm();
@@ -153,8 +184,7 @@
     printOperand(MI, Op+2, O);
     NeedPlus = true;
   }
-  
-  
+
   if (!DispSpec.isImm()) {
     if (NeedPlus) O << " + ";
     assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");

Modified: llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h Tue Jan 15 11:16:16 2013
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This class prints an X86 MCInst to intel style .s file syntax.
+// This class prints an X86 MCInst to Intel style .s file syntax.
 //
 //===----------------------------------------------------------------------===//
 
@@ -37,7 +37,8 @@
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
   void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
-  void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   
   void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     O << "OPAQUE PTR ";

Modified: llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp Tue Jan 15 11:16:16 2013
@@ -66,9 +66,10 @@
 };
 
 class X86AsmBackend : public MCAsmBackend {
+  StringRef CPU;
 public:
-  X86AsmBackend(const Target &T)
-    : MCAsmBackend() {}
+  X86AsmBackend(const Target &T, StringRef _CPU)
+    : MCAsmBackend(), CPU(_CPU) {}
 
   unsigned getNumFixupKinds() const {
     return X86::NumTargetFixupKinds;
@@ -278,9 +279,9 @@
   Res.setOpcode(RelaxedOp);
 }
 
-/// writeNopData - Write optimal nops to the output file for the \arg Count
+/// writeNopData - Write optimal nops to the output file for the \p Count
 /// bytes.  This returns the number of bytes written.  It may return 0 if
-/// the \arg Count is more than the maximum optimal nops.
+/// the \p Count is more than the maximum optimal nops.
 bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
   static const uint8_t Nops[10][10] = {
     // nop
@@ -305,6 +306,15 @@
     {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
   };
 
+  // This CPU doesn't support long nops. If needed, add more CPUs here.
+  // FIXME: Can we get this from the subtarget somehow?
+  if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" ||
+      CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") {
+    for (uint64_t i = 0; i < Count; ++i)
+      OW->Write8(0x90);
+    return true;
+  }
+
   // Write an optimal sequence for the first 15 bytes.
   const uint64_t OptimalCount = (Count < 16) ? Count : 15;
   const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
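
Effect of the new check, for hypothetical padding requests: on a CPU in the
list every request is satisfied with single-byte nops, while other CPUs keep
using the long forms from the table above:

    // Pad 4 bytes on i486:        90 90 90 90      (four 1-byte nops)
    // Pad 4 bytes on a newer CPU: 0f 1f 40 00      (the 4-byte long nop)
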
@@ -327,8 +337,8 @@
 class ELFX86AsmBackend : public X86AsmBackend {
 public:
   uint8_t OSABI;
-  ELFX86AsmBackend(const Target &T, uint8_t _OSABI)
-    : X86AsmBackend(T), OSABI(_OSABI) {
+  ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU)
+    : X86AsmBackend(T, CPU), OSABI(_OSABI) {
     HasReliableSymbolDifference = true;
   }
 
@@ -340,21 +350,21 @@
 
 class ELFX86_32AsmBackend : public ELFX86AsmBackend {
 public:
-  ELFX86_32AsmBackend(const Target &T, uint8_t OSABI)
-    : ELFX86AsmBackend(T, OSABI) {}
+  ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+    : ELFX86AsmBackend(T, OSABI, CPU) {}
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createX86ELFObjectWriter(OS, /*Is64Bit*/ false, OSABI);
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
   }
 };
 
 class ELFX86_64AsmBackend : public ELFX86AsmBackend {
 public:
-  ELFX86_64AsmBackend(const Target &T, uint8_t OSABI)
-    : ELFX86AsmBackend(T, OSABI) {}
+  ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+    : ELFX86AsmBackend(T, OSABI, CPU) {}
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createX86ELFObjectWriter(OS, /*Is64Bit*/ true, OSABI);
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
   }
 };
 
@@ -362,8 +372,8 @@
   bool Is64Bit;
 
 public:
-  WindowsX86AsmBackend(const Target &T, bool is64Bit)
-    : X86AsmBackend(T)
+  WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+    : X86AsmBackend(T, CPU)
     , Is64Bit(is64Bit) {
   }
 
@@ -374,14 +384,14 @@
 
 class DarwinX86AsmBackend : public X86AsmBackend {
 public:
-  DarwinX86AsmBackend(const Target &T)
-    : X86AsmBackend(T) { }
+  DarwinX86AsmBackend(const Target &T, StringRef CPU)
+    : X86AsmBackend(T, CPU) { }
 };
 
 class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
 public:
-  DarwinX86_32AsmBackend(const Target &T)
-    : DarwinX86AsmBackend(T) {}
+  DarwinX86_32AsmBackend(const Target &T, StringRef CPU)
+    : DarwinX86AsmBackend(T, CPU) {}
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
     return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
@@ -392,8 +402,8 @@
 
 class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
 public:
-  DarwinX86_64AsmBackend(const Target &T)
-    : DarwinX86AsmBackend(T) {
+  DarwinX86_64AsmBackend(const Target &T, StringRef CPU)
+    : DarwinX86AsmBackend(T, CPU) {
     HasReliableSymbolDifference = true;
   }
 
@@ -439,28 +449,28 @@
 
 } // end anonymous namespace
 
-MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
 
   if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return new DarwinX86_32AsmBackend(T);
+    return new DarwinX86_32AsmBackend(T, CPU);
 
-  if (TheTriple.isOSWindows())
-    return new WindowsX86AsmBackend(T, false);
+  if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+    return new WindowsX86AsmBackend(T, false, CPU);
 
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
-  return new ELFX86_32AsmBackend(T, OSABI);
+  return new ELFX86_32AsmBackend(T, OSABI, CPU);
 }
 
-MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
 
   if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return new DarwinX86_64AsmBackend(T);
+    return new DarwinX86_64AsmBackend(T, CPU);
 
-  if (TheTriple.isOSWindows())
-    return new WindowsX86AsmBackend(T, true);
+  if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+    return new WindowsX86AsmBackend(T, true, CPU);
 
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
-  return new ELFX86_64AsmBackend(T, OSABI);
+  return new ELFX86_64AsmBackend(T, OSABI, CPU);
 }

Modified: llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86BaseInfo.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86BaseInfo.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86BaseInfo.h Tue Jan 15 11:16:16 2013
@@ -276,9 +276,9 @@
     MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
     MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40,
     MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46,
-    MRM_D4 = 47, MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50,
-    MRM_DB = 51, MRM_DC = 52, MRM_DD = 53, MRM_DE = 54,
-    MRM_DF = 55,
+    MRM_D4 = 47, MRM_D5 = 48, MRM_D8 = 49, MRM_D9 = 50,
+    MRM_DA = 51, MRM_DB = 52, MRM_DC = 53, MRM_DD = 54,
+    MRM_DE = 55, MRM_DF = 56,
 
     /// RawFrmImm8 - This is used for the ENTER instruction, which has two
     /// immediates, the first of which is a 16-bit immediate (specified by
@@ -580,11 +580,11 @@
     case X86II::MRM_E8: case X86II::MRM_F0:
     case X86II::MRM_F8: case X86II::MRM_F9:
     case X86II::MRM_D0: case X86II::MRM_D1:
-    case X86II::MRM_D4: case X86II::MRM_D8:
-    case X86II::MRM_D9: case X86II::MRM_DA:
-    case X86II::MRM_DB: case X86II::MRM_DC:
-    case X86II::MRM_DD: case X86II::MRM_DE:
-    case X86II::MRM_DF:
+    case X86II::MRM_D4: case X86II::MRM_D5:
+    case X86II::MRM_D8: case X86II::MRM_D9:
+    case X86II::MRM_DA: case X86II::MRM_DB:
+    case X86II::MRM_DC: case X86II::MRM_DD:
+    case X86II::MRM_DE: case X86II::MRM_DF:
       return -1;
     }
   }

Modified: llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp Tue Jan 15 11:16:16 2013
@@ -20,7 +20,7 @@
 namespace {
   class X86ELFObjectWriter : public MCELFObjectTargetWriter {
   public:
-    X86ELFObjectWriter(bool is64Bit, uint8_t OSABI);
+    X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
 
     virtual ~X86ELFObjectWriter();
   protected:
@@ -30,10 +30,11 @@
   };
 }
 
-X86ELFObjectWriter::X86ELFObjectWriter(bool Is64Bit, uint8_t OSABI)
-  : MCELFObjectTargetWriter(Is64Bit, OSABI,
-                            Is64Bit ?  ELF::EM_X86_64 : ELF::EM_386,
-                            /*HasRelocationAddend*/ Is64Bit) {}
+X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+                                       uint16_t EMachine)
+  : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+                            // Only i386 uses Rel instead of RelA.
+                            /*HasRelocationAddend*/ EMachine != ELF::EM_386) {}
 
 X86ELFObjectWriter::~X86ELFObjectWriter()
 {}
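
The constructor change encodes a real ABI distinction: i386 ELF uses REL
relocations, whose addend lives in the relocated field itself, while x86-64
(and other machines routed through here) use RELA records with an explicit
addend. Schematically:

    // REL  (EM_386):    { r_offset, r_info }            // addend in place
    // RELA (EM_X86_64): { r_offset, r_info, r_addend }  // explicit addend
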
@@ -48,7 +49,7 @@
   MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
     MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
   unsigned Type;
-  if (is64Bit()) {
+  if (getEMachine() == ELF::EM_X86_64) {
     if (IsPCRel) {
       switch ((unsigned)Fixup.getKind()) {
       default: llvm_unreachable("invalid fixup kind!");
@@ -130,7 +131,7 @@
       case FK_Data_1: Type = ELF::R_X86_64_8; break;
       }
     }
-  } else {
+  } else if (getEMachine() == ELF::EM_386) {
     if (IsPCRel) {
       switch ((unsigned)Fixup.getKind()) {
       default: llvm_unreachable("invalid fixup kind!");
@@ -210,15 +211,17 @@
       case FK_Data_1: Type = ELF::R_386_8; break;
       }
     }
-  }
+  } else
+    llvm_unreachable("Unsupported ELF machine type.");
 
   return Type;
 }
 
 MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS,
-                                               bool Is64Bit,
-                                               uint8_t OSABI) {
+                                               bool IsELF64,
+                                               uint8_t OSABI,
+                                               uint16_t EMachine) {
   MCELFObjectTargetWriter *MOTW =
-    new X86ELFObjectWriter(Is64Bit, OSABI);
+    new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
   return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/true);
 }

Modified: llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp Tue Jan 15 11:16:16 2013
@@ -34,6 +34,10 @@
              clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
              clEnumValEnd));
 
+static cl::opt<bool>
+MarkedJTDataRegions("mark-data-regions", cl::init(false),
+  cl::desc("Mark code section jump table data regions."),
+  cl::Hidden);
 
 void X86MCAsmInfoDarwin::anchor() { }
 
@@ -59,6 +63,7 @@
 
   SupportsDebugInformation = true;
   DwarfUsesInlineInfoSection = true;
+  UseDataRegionDirectives = MarkedJTDataRegions;
 
   // Exceptions handling
   ExceptionsType = ExceptionHandling::DwarfCFI;

Modified: llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp Tue Jan 15 11:16:16 2013
@@ -16,6 +16,7 @@
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86FixupKinds.h"
 #include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -28,8 +29,8 @@
 
 namespace {
 class X86MCCodeEmitter : public MCCodeEmitter {
-  X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
-  void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+  X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
   const MCInstrInfo &MCII;
   const MCSubtargetInfo &STI;
   MCContext &Ctx;
@@ -51,8 +52,8 @@
     return (STI.getFeatureBits() & X86::Mode64Bit) == 0;
   }
 
-  static unsigned GetX86RegNum(const MCOperand &MO) {
-    return X86_MC::getX86RegNum(MO.getReg());
+  unsigned GetX86RegNum(const MCOperand &MO) const {
+    return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7;
   }
 
   // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
@@ -64,8 +65,8 @@
   //  VEX.VVVV    => XMM9 => ~9
   //
   // See table 4-35 of Intel AVX Programming Reference for details.
-  static unsigned char getVEXRegisterEncoding(const MCInst &MI,
-                                              unsigned OpNum) {
+  unsigned char getVEXRegisterEncoding(const MCInst &MI,
+                                       unsigned OpNum) const {
     unsigned SrcReg = MI.getOperand(OpNum).getReg();
     unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
     if (X86II::isX86_64ExtendedReg(SrcReg))
@@ -560,15 +561,6 @@
   }
 
 
-  // Set the vector length to 256-bit if YMM0-YMM15 is used
-  for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
-    if (!MI.getOperand(i).isReg())
-      continue;
-    unsigned SrcReg = MI.getOperand(i).getReg();
-    if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
-      VEX_L = 1;
-  }
-
   // Classify VEX_B, VEX_4V, VEX_R, VEX_X
   unsigned NumOps = Desc.getNumOperands();
   unsigned CurOp = 0;
@@ -1129,13 +1121,13 @@
   case X86II::MRM_C3: case X86II::MRM_C4:
   case X86II::MRM_C8: case X86II::MRM_C9:
   case X86II::MRM_D0: case X86II::MRM_D1:
-  case X86II::MRM_D4: case X86II::MRM_D8:
-  case X86II::MRM_D9: case X86II::MRM_DA:
-  case X86II::MRM_DB: case X86II::MRM_DC:
-  case X86II::MRM_DD: case X86II::MRM_DE:
-  case X86II::MRM_DF: case X86II::MRM_E8:
-  case X86II::MRM_F0: case X86II::MRM_F8:
-  case X86II::MRM_F9:
+  case X86II::MRM_D4: case X86II::MRM_D5:
+  case X86II::MRM_D8: case X86II::MRM_D9:
+  case X86II::MRM_DA: case X86II::MRM_DB:
+  case X86II::MRM_DC: case X86II::MRM_DD:
+  case X86II::MRM_DE: case X86II::MRM_DF:
+  case X86II::MRM_E8: case X86II::MRM_F0:
+  case X86II::MRM_F8: case X86II::MRM_F9:
     EmitByte(BaseOpcode, CurByte, OS);
 
     unsigned char MRM;
@@ -1150,6 +1142,7 @@
     case X86II::MRM_D0: MRM = 0xD0; break;
     case X86II::MRM_D1: MRM = 0xD1; break;
     case X86II::MRM_D4: MRM = 0xD4; break;
+    case X86II::MRM_D5: MRM = 0xD5; break;
     case X86II::MRM_D8: MRM = 0xD8; break;
     case X86II::MRM_D9: MRM = 0xD9; break;
     case X86II::MRM_DA: MRM = 0xDA; break;
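
The GetX86RegNum rewrite above is the substantive change: the ModRM
register field now comes straight from the TableGen'erated hardware
encoding rather than a hand-maintained mapping. A sketch of the scheme,
with XMM9 as the worked value (assumes an X86 MCRegisterInfo):

  unsigned Enc = MRI.getEncodingValue(X86::XMM9); // full HW encoding: 9
  unsigned ModRMField = Enc & 0x7;                // low 3 bits: 1
  bool NeedsExtBit = (Enc >> 3) & 1;              // bit 3: set for XMM8-15

The & 0x7 mask is safe because the fourth encoding bit travels in the
REX.R/X/B or VEX fields, never in ModRM. That is presumably also why the
"scan operands for YMM to set VEX_L" loop could be removed: the vector
length is carried by the instruction's own flag bits instead.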

Modified: llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp Tue Jan 15 11:16:16 2013
@@ -209,117 +209,10 @@
   return DWARFFlavour::X86_32_Generic;
 }
 
-/// getX86RegNum - This function maps LLVM register identifiers to their X86
-/// specific numbering, which is used in various places encoding instructions.
-unsigned X86_MC::getX86RegNum(unsigned RegNo) {
-  switch(RegNo) {
-  case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
-  case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
-  case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
-  case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
-  case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
-    return N86::ESP;
-  case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
-    return N86::EBP;
-  case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
-    return N86::ESI;
-  case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
-    return N86::EDI;
-
-  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
-    return N86::EAX;
-  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
-    return N86::ECX;
-  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
-    return N86::EDX;
-  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
-    return N86::EBX;
-  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
-    return N86::ESP;
-  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
-    return N86::EBP;
-  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
-    return N86::ESI;
-  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
-    return N86::EDI;
-
-  case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
-  case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
-    return RegNo-X86::ST0;
-
-  case X86::XMM0: case X86::XMM8:
-  case X86::YMM0: case X86::YMM8: case X86::MM0:
-    return 0;
-  case X86::XMM1: case X86::XMM9:
-  case X86::YMM1: case X86::YMM9: case X86::MM1:
-    return 1;
-  case X86::XMM2: case X86::XMM10:
-  case X86::YMM2: case X86::YMM10: case X86::MM2:
-    return 2;
-  case X86::XMM3: case X86::XMM11:
-  case X86::YMM3: case X86::YMM11: case X86::MM3:
-    return 3;
-  case X86::XMM4: case X86::XMM12:
-  case X86::YMM4: case X86::YMM12: case X86::MM4:
-    return 4;
-  case X86::XMM5: case X86::XMM13:
-  case X86::YMM5: case X86::YMM13: case X86::MM5:
-    return 5;
-  case X86::XMM6: case X86::XMM14:
-  case X86::YMM6: case X86::YMM14: case X86::MM6:
-    return 6;
-  case X86::XMM7: case X86::XMM15:
-  case X86::YMM7: case X86::YMM15: case X86::MM7:
-    return 7;
-
-  case X86::ES: return 0;
-  case X86::CS: return 1;
-  case X86::SS: return 2;
-  case X86::DS: return 3;
-  case X86::FS: return 4;
-  case X86::GS: return 5;
-
-  case X86::CR0: case X86::CR8 : case X86::DR0: return 0;
-  case X86::CR1: case X86::CR9 : case X86::DR1: return 1;
-  case X86::CR2: case X86::CR10: case X86::DR2: return 2;
-  case X86::CR3: case X86::CR11: case X86::DR3: return 3;
-  case X86::CR4: case X86::CR12: case X86::DR4: return 4;
-  case X86::CR5: case X86::CR13: case X86::DR5: return 5;
-  case X86::CR6: case X86::CR14: case X86::DR6: return 6;
-  case X86::CR7: case X86::CR15: case X86::DR7: return 7;
-
-  // Pseudo index registers are equivalent to a "none"
-  // scaled index (See Intel Manual 2A, table 2-3)
-  case X86::EIZ:
-  case X86::RIZ:
-    return 4;
-
-  default:
-    assert((int(RegNo) > 0) && "Unknown physical register!");
-    return 0;
-  }
-}
-
 void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
   // FIXME: TableGen these.
   for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
-    int SEH = X86_MC::getX86RegNum(Reg);
-    switch (Reg) {
-    case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
-    case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
-    case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
-    case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
-    case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
-    case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
-    case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
-    case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
-    case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
-    case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
-    case X86::YMM8:  case X86::YMM9:  case X86::YMM10: case X86::YMM11:
-    case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
-      SEH += 8;
-      break;
-    }
+    unsigned SEH = MRI->getEncodingValue(Reg);
     MRI->mapLLVMRegToSEHReg(Reg, SEH);
   }
 }
@@ -379,11 +272,15 @@
       MAI = new X86_64MCAsmInfoDarwin(TheTriple);
     else
       MAI = new X86MCAsmInfoDarwin(TheTriple);
+  } else if (TheTriple.getEnvironment() == Triple::ELF) {
+    // Force the use of an ELF container.
+    MAI = new X86ELFMCAsmInfo(TheTriple);
   } else if (TheTriple.getOS() == Triple::Win32) {
     MAI = new X86MCAsmInfoMicrosoft(TheTriple);
   } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) {
     MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
   } else {
+    // The default is ELF.
     MAI = new X86ELFMCAsmInfo(TheTriple);
   }
 
@@ -465,7 +362,7 @@
   if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
     return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
 
-  if (TheTriple.isOSWindows())
+  if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
     return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
 
   return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
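
The big deletion here is pure de-duplication: the hand-written switch
encoded exactly the values TableGen already knows. R8-R15, XMM8-15 and
YMM8-15 all mapped to "base register number plus 8", which is precisely
their hardware encoding, so the SEH table collapses to one line per
register; illustratively:

  unsigned SEH = MRI->getEncodingValue(X86::R8); // 8 == old N86::EAX + 8
  MRI->mapLLVMRegToSEHReg(X86::R8, SEH);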

Modified: llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h Tue Jan 15 11:16:16 2013
@@ -64,8 +64,6 @@
 
   unsigned getDwarfRegFlavour(StringRef TT, bool isEH);
 
-  unsigned getX86RegNum(unsigned RegNo);
-
   void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
 
   /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance.
@@ -80,8 +78,8 @@
                                       const MCSubtargetInfo &STI,
                                       MCContext &Ctx);
 
-MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT);
-MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU);
 
 /// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
 MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
@@ -91,8 +89,9 @@
 
 /// createX86ELFObjectWriter - Construct an X86 ELF object writer.
 MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
-                                         bool Is64Bit,
-                                         uint8_t OSABI);
+                                         bool IsELF64,
+                                         uint8_t OSABI,
+                                         uint16_t EMachine);
 /// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
 MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
 } // End llvm namespace

Modified: llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp Tue Jan 15 11:16:16 2013
@@ -11,11 +11,13 @@
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Object/MachOFormat.h"
 
 using namespace llvm;
@@ -23,7 +25,7 @@
 
 namespace {
 class X86MachObjectWriter : public MCMachObjectTargetWriter {
-  void RecordScatteredRelocation(MachObjectWriter *Writer,
+  bool RecordScatteredRelocation(MachObjectWriter *Writer,
                                  const MCAssembler &Asm,
                                  const MCAsmLayout &Layout,
                                  const MCFragment *Fragment,
@@ -335,7 +337,7 @@
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
-void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
+bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
                                                     const MCAssembler &Asm,
                                                     const MCAsmLayout &Layout,
                                                     const MCFragment *Fragment,
@@ -381,6 +383,19 @@
   // Relocations are written out in reverse order, so the PAIR comes first.
   if (Type == macho::RIT_Difference ||
       Type == macho::RIT_Generic_LocalDifference) {
+    // If the offset is too large to fit in a scattered relocation,
+    // we're hosed. It's an unfortunate limitation of the MachO format.
+    if (FixupOffset > 0xffffff) {
+      char Buffer[32];
+      format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                         Twine("Section too large, can't encode "
+                                "r_address (") + Buffer +
+                         ") into 24 bits of scattered "
+                         "relocation entry.");
+      llvm_unreachable("fatal error returned?!");
+    }
+
     macho::RelocationEntry MRE;
     MRE.Word0 = ((0         <<  0) |
                  (macho::RIT_Pair  << 24) |
@@ -389,6 +404,16 @@
                  macho::RF_Scattered);
     MRE.Word1 = Value2;
     Writer->addRelocation(Fragment->getParent(), MRE);
+  } else {
+    // If the offset is more than 24-bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+    // relocation. This is a bit risky, as if the offset reaches out of
+    // the block and the linker is doing scattered loading on this
+    // symbol, things can go badly.
+    //
+    // Required for 'as' compatibility.
+    if (FixupOffset > 0xffffff)
+      return false;
   }
 
   macho::RelocationEntry MRE;
@@ -399,6 +424,7 @@
                macho::RF_Scattered);
   MRE.Word1 = Value;
   Writer->addRelocation(Fragment->getParent(), MRE);
+  return true;
 }
 
 void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
@@ -469,9 +495,11 @@
   // If this is a difference or a defined symbol plus an offset, then we need a
   // scattered relocation entry. Differences always require scattered
   // relocations.
-  if (Target.getSymB())
-    return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
-                                     Target, Log2Size, FixedValue);
+  if (Target.getSymB()) {
+    RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                              Target, Log2Size, FixedValue);
+    return;
+  }
 
   // Get the symbol data, if any.
   MCSymbolData *SD = 0;
@@ -483,9 +511,13 @@
   uint32_t Offset = Target.getConstant();
   if (IsPCRel)
     Offset += 1 << Log2Size;
-  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
-    return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
-                                     Target, Log2Size, FixedValue);
+  // Try to record the scattered relocation if needed. Fall back to
+  // non-scattered if necessary (see comments in RecordScatteredRelocation()
+  // for details).
+  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD) &&
+      RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                Target, Log2Size, FixedValue))
+    return;
 
   // See <reloc.h>.
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
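
The 0xffffff guard shows up twice above because a scattered Mach-O
relocation stores r_address in a 24-bit field, so offsets of 2^24 bytes
or more cannot be encoded at all. A sketch of the two outcomes:

  if (FixupOffset > 0xffffff) {   // does not fit in 24 bits
    // Difference (PAIR) case: no fallback exists -> fatal error.
    // Plain local-symbol case: return false so the caller can fall
    // back to an ordinary, non-scattered relocation ('as' compatible).
  }

That asymmetry is why RecordScatteredRelocation now returns bool and why
its second caller above only early-returns when it succeeds.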

Modified: llvm/branches/AMDILBackend/lib/Target/X86/README-SSE.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/README-SSE.txt?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/README-SSE.txt (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/README-SSE.txt Tue Jan 15 11:16:16 2013
@@ -941,3 +941,15 @@
 cost of reduced accuracy.
 
 //===---------------------------------------------------------------------===//
+
+This function should be matched to haddpd when the appropriate CPU is enabled:
+
+#include <x86intrin.h>
+double f (__m128d p) {
+  return p[0] + p[1];
+}
+
+Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
+turn into hsubpd also.
+
+//===---------------------------------------------------------------------===//
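
For reference, the horizontal add this entry wants pattern-matched is
already exposed as an SSE3 intrinsic, so the hand-written form looks
like this (compile with -msse3 or later; illustrative, not part of the
note itself):

  #include <x86intrin.h>
  double f_hadd(__m128d p) {
    __m128d s = _mm_hadd_pd(p, p);   // haddpd: s[0] = p[0] + p[1]
    return _mm_cvtsd_f64(s);         // extract the low lane
  }

The README item is about getting the plain p[0] + p[1] form to codegen
to the same instruction without spelling the intrinsic.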

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86.td Tue Jan 15 11:16:16 2013
@@ -17,14 +17,14 @@
 include "llvm/Target/Target.td"
 
 //===----------------------------------------------------------------------===//
-// X86 Subtarget state.
+// X86 Subtarget state
 //
 
 def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
                                   "64-bit mode (x86_64)">;
 
 //===----------------------------------------------------------------------===//
-// X86 Subtarget features.
+// X86 Subtarget features
 //===----------------------------------------------------------------------===//
 
 def FeatureCMOV    : SubtargetFeature<"cmov","HasCMov", "true",
@@ -97,7 +97,7 @@
                                       [FeatureAVX, FeatureSSE4A]>;
 def FeatureXOP     : SubtargetFeature<"xop", "HasXOP", "true",
                                       "Enable XOP instructions",
-                                      [FeatureAVX, FeatureSSE4A]>;
+                                      [FeatureFMA4]>;
 def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
                                           "HasVectorUAMem", "true",
                  "Allow unaligned memory operands on vector/SIMD instructions">;
@@ -118,8 +118,13 @@
                                       "Support BMI instructions">;
 def FeatureBMI2    : SubtargetFeature<"bmi2", "HasBMI2", "true",
                                       "Support BMI2 instructions">;
+def FeatureRTM     : SubtargetFeature<"rtm", "HasRTM", "true",
+                                      "Support RTM instructions">;
 def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
                                      "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+                          "HasSlowDivide", "true",
+                          "Use small divide for positive values less than 256">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -159,8 +164,9 @@
                                FeatureSlowBTMem]>;
 def : Proc<"penryn",          [FeatureSSE41, FeatureCMPXCHG16B,
                                FeatureSlowBTMem]>;
-def : AtomProc<"atom",        [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B,
-                               FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>;
+def : AtomProc<"atom",        [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
+                               FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+                               FeatureSlowDivide]>;
 // "Arrandale" along with corei3 and corei5
 def : Proc<"corei7",          [FeatureSSE42, FeatureCMPXCHG16B,
                                FeatureSlowBTMem, FeatureFastUAMem,
@@ -188,7 +194,8 @@
                                FeatureAES, FeaturePCLMUL, FeatureRDRAND,
                                FeatureF16C, FeatureFSGSBase,
                                FeatureMOVBE, FeatureLZCNT, FeatureBMI,
-                               FeatureBMI2, FeatureFMA]>;
+                               FeatureBMI2, FeatureFMA,
+                               FeatureRTM]>;
 
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
@@ -226,7 +233,8 @@
 def : Proc<"bdver2",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeatureAES, FeaturePCLMUL,
                                FeatureF16C, FeatureLZCNT,
-                               FeaturePOPCNT, FeatureBMI]>;
+                               FeaturePOPCNT, FeatureBMI, FeatureFMA]>;
+def : Proc<"geode",           [Feature3DNowA]>;
 
 def : Proc<"winchip-c6",      [FeatureMMX]>;
 def : Proc<"winchip2",        [Feature3DNow]>;

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86AsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86AsmPrinter.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86AsmPrinter.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86AsmPrinter.cpp Tue Jan 15 11:16:16 2013
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86AsmPrinter.h"
-#include "X86MCInstLower.h"
 #include "X86.h"
 #include "X86COFFMachineModuleInfo.h"
 #include "X86MachineFunctionInfo.h"
@@ -206,10 +205,10 @@
   }
 }
 
-/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// printPCRelImm - This is used to print an immediate value that ends up
 /// being encoded as a pc-relative value.  These print slightly differently, for
 /// example, a $ is not emitted.
-void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo,
+void X86AsmPrinter::printPCRelImm(const MachineInstr *MI, unsigned OpNo,
                                     raw_ostream &O) {
   const MachineOperand &MO = MI->getOperand(OpNo);
   switch (MO.getType()) {
@@ -233,15 +232,17 @@
 
 
 void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
-                                 raw_ostream &O, const char *Modifier) {
+                                 raw_ostream &O, const char *Modifier,
+                                 unsigned AsmVariant) {
   const MachineOperand &MO = MI->getOperand(OpNo);
   switch (MO.getType()) {
   default: llvm_unreachable("unknown operand type!");
   case MachineOperand::MO_Register: {
-    O << '%';
+    // FIXME: Enumerate AsmVariant so we can remove this magic number.
+    if (AsmVariant == 0) O << '%';
     unsigned Reg = MO.getReg();
     if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
-      EVT VT = (strcmp(Modifier+6,"64") == 0) ?
+      MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ?
         MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
                     ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
       Reg = getX86SubSuperRegister(Reg, VT);
@@ -265,46 +266,6 @@
   }
 }
 
-void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op,
-                               raw_ostream &O) {
-  unsigned char value = MI->getOperand(Op).getImm();
-  switch (value) {
-  default: llvm_unreachable("Invalid ssecc argument!");
-  case    0: O << "eq"; break;
-  case    1: O << "lt"; break;
-  case    2: O << "le"; break;
-  case    3: O << "unord"; break;
-  case    4: O << "neq"; break;
-  case    5: O << "nlt"; break;
-  case    6: O << "nle"; break;
-  case    7: O << "ord"; break;
-  case    8: O << "eq_uq"; break;
-  case    9: O << "nge"; break;
-  case  0xa: O << "ngt"; break;
-  case  0xb: O << "false"; break;
-  case  0xc: O << "neq_oq"; break;
-  case  0xd: O << "ge"; break;
-  case  0xe: O << "gt"; break;
-  case  0xf: O << "true"; break;
-  case 0x10: O << "eq_os"; break;
-  case 0x11: O << "lt_oq"; break;
-  case 0x12: O << "le_oq"; break;
-  case 0x13: O << "unord_s"; break;
-  case 0x14: O << "neq_us"; break;
-  case 0x15: O << "nlt_uq"; break;
-  case 0x16: O << "nle_uq"; break;
-  case 0x17: O << "ord_s"; break;
-  case 0x18: O << "eq_us"; break;
-  case 0x19: O << "nge_uq"; break;
-  case 0x1a: O << "ngt_uq"; break;
-  case 0x1b: O << "false_os"; break;
-  case 0x1c: O << "neq_os"; break;
-  case 0x1d: O << "ge_oq"; break;
-  case 0x1e: O << "gt_oq"; break;
-  case 0x1f: O << "true_us"; break;
-  }
-}
-
 void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
                                          raw_ostream &O, const char *Modifier) {
   const MachineOperand &BaseReg  = MI->getOperand(Op);
@@ -363,10 +324,51 @@
   printLeaMemReference(MI, Op, O, Modifier);
 }
 
-void X86AsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op,
-                                  raw_ostream &O) {
-  O << *MF->getPICBaseSymbol() << '\n';
-  O << *MF->getPICBaseSymbol() << ':';
+void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
+                                           raw_ostream &O, const char *Modifier,
+                                           unsigned AsmVariant){
+  const MachineOperand &BaseReg  = MI->getOperand(Op);
+  unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+  const MachineOperand &IndexReg = MI->getOperand(Op+2);
+  const MachineOperand &DispSpec = MI->getOperand(Op+3);
+  const MachineOperand &SegReg   = MI->getOperand(Op+4);
+  
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+4, O, Modifier, AsmVariant);
+    O << ':';
+  }
+  
+  O << '[';
+  
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOperand(MI, Op, O, Modifier, AsmVariant);
+    NeedPlus = true;
+  }
+  
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << '*';
+    printOperand(MI, Op+2, O, Modifier, AsmVariant);
+    NeedPlus = true;
+  }
+
+  assert (DispSpec.isImm() && "Displacement is not an immediate!");
+  int64_t DispVal = DispSpec.getImm();
+  if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+    if (NeedPlus) {
+      if (DispVal > 0)
+        O << " + ";
+      else {
+        O << " - ";
+        DispVal = -DispVal;
+      }
+    }
+    O << DispVal;
+  }  
+  O << ']';
 }
 
 bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
@@ -457,7 +459,7 @@
       return false;
 
     case 'P': // This is the operand of a call, treat specially.
-      print_pcrel_imm(MI, OpNo, O);
+      printPCRelImm(MI, OpNo, O);
       return false;
 
     case 'n':  // Negate the immediate or print a '-' before the operand.
@@ -471,7 +473,7 @@
     }
   }
 
-  printOperand(MI, OpNo, O);
+  printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
   return false;
 }
 
@@ -479,6 +481,11 @@
                                           unsigned OpNo, unsigned AsmVariant,
                                           const char *ExtraCode,
                                           raw_ostream &O) {
+  if (AsmVariant) {
+    printIntelMemReference(MI, OpNo, O);
+    return false;
+  }
+
   if (ExtraCode && ExtraCode[0]) {
     if (ExtraCode[1] != 0) return true; // Unknown modifier.
 
@@ -680,7 +687,7 @@
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const TargetData *TD = TM.getTargetData();
+      const DataLayout *TD = TM.getDataLayout();
 
       for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
         OutStreamer.EmitLabel(Stubs[i].first);
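
printIntelMemReference is new, so here is a standalone sketch of the
operand shape it emits; this is illustrative C++ mirroring the logic
above, not the LLVM implementation itself:

  #include <cstdint>
  #include <string>

  // Produces "[base + scale*index +/- disp]" with every piece optional.
  static std::string intelMem(const std::string &Base, unsigned Scale,
                              const std::string &Index, int64_t Disp) {
    std::string S = "[";
    bool NeedPlus = false;
    if (!Base.empty()) { S += Base; NeedPlus = true; }
    if (!Index.empty()) {
      if (NeedPlus) S += " + ";
      if (Scale != 1) S += std::to_string(Scale) + "*";
      S += Index;
      NeedPlus = true;
    }
    if (Disp || !NeedPlus) {          // always print something inside []
      if (NeedPlus) {
        S += (Disp > 0) ? " + " : " - ";
        if (Disp < 0) Disp = -Disp;
      }
      S += std::to_string(Disp);
    }
    return S + "]";
  }

  // intelMem("rax", 4, "rcx", -16) == "[rax + 4*rcx - 16]"
  // intelMem("",    1, "",     8)  == "[8]"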

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86AsmPrinter.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86AsmPrinter.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86AsmPrinter.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86AsmPrinter.h Tue Jan 15 11:16:16 2013
@@ -34,47 +34,48 @@
     Subtarget = &TM.getSubtarget<X86Subtarget>();
   }
 
-  virtual const char *getPassName() const {
+  virtual const char *getPassName() const LLVM_OVERRIDE {
     return "X86 AT&T-Style Assembly Printer";
   }
 
   const X86Subtarget &getSubtarget() const { return *Subtarget; }
 
-  virtual void EmitStartOfAsmFile(Module &M);
+  virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE;
 
-  virtual void EmitEndOfAsmFile(Module &M);
+  virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
 
-  virtual void EmitInstruction(const MachineInstr *MI);
+  virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
 
   void printSymbolOperand(const MachineOperand &MO, raw_ostream &O);
 
   // These methods are used by the tablegen'erated instruction printer.
   void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
-                    const char *Modifier = 0);
-  void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+                    const char *Modifier = 0, unsigned AsmVariant = 0);
+  void printPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
 
   bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
-  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                       unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &OS);
-  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
-                             unsigned AsmVariant, const char *ExtraCode,
-                             raw_ostream &OS);
+  virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &OS) LLVM_OVERRIDE;
+  virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                     unsigned AsmVariant, const char *ExtraCode,
+                                     raw_ostream &OS) LLVM_OVERRIDE;
 
-  void printMachineInstruction(const MachineInstr *MI);
-  void printSSECC(const MachineInstr *MI, unsigned Op, raw_ostream &O);
   void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
                          const char *Modifier=NULL);
   void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
                             const char *Modifier=NULL);
 
-  void printPICLabel(const MachineInstr *MI, unsigned Op, raw_ostream &O);
+  void printIntelMemReference(const MachineInstr *MI, unsigned Op,
+                              raw_ostream &O, const char *Modifier=NULL,
+                              unsigned AsmVariant = 1);
 
-  bool runOnMachineFunction(MachineFunction &F);
+  virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
 
   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
 
-  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+  virtual MachineLocation
+    getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
 };
 
 } // end namespace llvm

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86COFFMachineModuleInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86COFFMachineModuleInfo.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86COFFMachineModuleInfo.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86COFFMachineModuleInfo.h Tue Jan 15 11:16:16 2013
@@ -20,7 +20,7 @@
 
 namespace llvm {
   class X86MachineFunctionInfo;
-  class TargetData;
+  class DataLayout;
 
 /// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
 /// for X86 COFF targets.

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86CallingConv.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86CallingConv.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86CallingConv.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86CallingConv.td Tue Jan 15 11:16:16 2013
@@ -88,6 +88,21 @@
   CCDelegateTo<RetCC_X86Common>
 ]>;
 
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+  // Vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit FP vectors
+  // No more than 4 registers
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // i32, i64 in the standard way
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
 // X86-64 C return-value convention.
 def RetCC_X86_64_C : CallingConv<[
   // The X86-64 calling convention always returns FP values in XMM0.
@@ -128,6 +143,10 @@
 
 // This is the return-value convention used for the entire X86 backend.
 def RetCC_X86 : CallingConv<[
+
+  // Check if this is the Intel OpenCL built-ins calling convention
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
   CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
   CCDelegateTo<RetCC_X86_32>
 ]>;
@@ -235,6 +254,29 @@
   CCIfType<[f80], CCAssignToStack<0, 0>>
 ]>;
 
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+  CCIfType<[i32], CCIfSubtarget<"isTargetWin32()", CCAssignToStack<4, 4>>>,
+
+  CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+  CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8,  R9 ]>>>,
+
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX]>>,
+
+  // The SSE vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+  
+  // The 256-bit vector arguments are passed in YMM registers.
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+                CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+  
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+  CCDelegateTo<CC_X86_64_C>
+]>;
+
+
 def CC_X86_64_GHC : CallingConv<[
   // Promote i8/i16/i32 arguments to i64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@@ -324,7 +366,7 @@
   CCIfNest<CCAssignToReg<[EAX]>>,
 
   // The first 2 integer arguments are passed in ECX/EDX
-  CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+  CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
 
   // Otherwise, same as everything else.
   CCDelegateTo<CC_X86_32_Common>
@@ -408,6 +450,7 @@
 
 // This is the argument convention used for the entire X86 backend.
 def CC_X86 : CallingConv<[
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
   CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
   CCDelegateTo<CC_X86_32>
 ]>;
@@ -426,3 +469,17 @@
 
 def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
                                      (sequence "XMM%u", 6, 15))>;
+
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+                                                  R13, R14, R15, 
+                                                  (sequence "YMM%u", 6, 15))>;
+
+// Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI       : CalleeSavedRegs<(add CSR_64,
+                                                 (sequence "XMM%u", 8, 15))>;
+
+// Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX    : CalleeSavedRegs<(add CSR_64,
+                                                  (sequence "YMM%u", 8, 15))>;
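
Pulling the new CC_Intel_OCL_BI rules together, argument placement works
out to (first four of each class, as encoded above):

  i32, Win32 targets:      stack, 4-byte slots
  i32 / i64, Win64:        ECX, EDX, R8D, R9D  /  RCX, RDX, R8, R9
  i32 / i64, elsewhere:    EDI, ESI, EDX, ECX  /  RDI, RSI, RDX, RCX
  f32/f64, 128-bit vecs:   XMM0-XMM3
  256-bit vectors:         YMM0-YMM3

Anything left over is delegated to the normal Win64 or SysV C convention,
and results come back in XMM0-XMM3 / YMM0-YMM3 per RetCC_Intel_OCL_BI.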

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86CodeEmitter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86CodeEmitter.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86CodeEmitter.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86CodeEmitter.cpp Tue Jan 15 11:16:16 2013
@@ -26,7 +26,6 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/Function.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCExpr.h"
@@ -43,7 +42,7 @@
   template<class CodeEmitter>
   class Emitter : public MachineFunctionPass {
     const X86InstrInfo  *II;
-    const TargetData    *TD;
+    const DataLayout    *TD;
     X86TargetMachine    &TM;
     CodeEmitter         &MCE;
     MachineModuleInfo   *MMI;
@@ -57,7 +56,7 @@
       MCE(mce), PICBaseOffset(0), Is64BitMode(false),
       IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
     Emitter(X86TargetMachine &tm, CodeEmitter &mce,
-            const X86InstrInfo &ii, const TargetData &td, bool is64)
+            const X86InstrInfo &ii, const DataLayout &td, bool is64)
       : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
       MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
       IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
@@ -110,6 +109,14 @@
     void emitMemModRMByte(const MachineInstr &MI,
                           unsigned Op, unsigned RegOpcodeField,
                           intptr_t PCAdj = 0);
+
+    unsigned getX86RegNum(unsigned RegNo) const {
+      const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+      return TRI->getEncodingValue(RegNo) & 0x7;
+    }
+
+    unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
+                                         unsigned OpNum) const;
   };
 
 template<class CodeEmitter>
@@ -129,13 +136,12 @@
   MCE.setModuleInfo(MMI);
 
   II = TM.getInstrInfo();
-  TD = TM.getTargetData();
+  TD = TM.getDataLayout();
   Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
   IsPIC = TM.getRelocationModel() == Reloc::PIC_;
 
   do {
-    DEBUG(dbgs() << "JITTing function '"
-          << MF.getFunction()->getName() << "'\n");
+    DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n");
     MCE.startFunction(MF);
     for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
          MBB != E; ++MBB) {
@@ -365,7 +371,7 @@
 template<class CodeEmitter>
 void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
                                             unsigned RegOpcodeFld){
-  MCE.emitByte(ModRMByte(3, RegOpcodeFld, X86_MC::getX86RegNum(ModRMReg)));
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
 }
 
 template<class CodeEmitter>
@@ -503,7 +509,7 @@
   // 2-7) and absolute references.
   unsigned BaseRegNo = -1U;
   if (BaseReg != 0 && BaseReg != X86::RIP)
-    BaseRegNo = X86_MC::getX86RegNum(BaseReg);
+    BaseRegNo = getX86RegNum(BaseReg);
 
   if (// The SIB byte must be used if there is an index register.
       IndexReg.getReg() == 0 &&
@@ -579,15 +585,15 @@
     // Manual 2A, table 2-7. The displacement has already been output.
     unsigned IndexRegNo;
     if (IndexReg.getReg())
-      IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg());
+      IndexRegNo = getX86RegNum(IndexReg.getReg());
     else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
       IndexRegNo = 4;
     emitSIBByte(SS, IndexRegNo, 5);
   } else {
-    unsigned BaseRegNo = X86_MC::getX86RegNum(BaseReg);
+    unsigned BaseRegNo = getX86RegNum(BaseReg);
     unsigned IndexRegNo;
     if (IndexReg.getReg())
-      IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg());
+      IndexRegNo = getX86RegNum(IndexReg.getReg());
     else
       IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
     emitSIBByte(SS, IndexRegNo, BaseRegNo);
@@ -758,10 +764,12 @@
 //  VEX.VVVV    => XMM9 => ~9
 //
 // See table 4-35 of Intel AVX Programming Reference for details.
-static unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
-                                            unsigned OpNum) {
+template<class CodeEmitter>
+unsigned char
+Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI,
+                                             unsigned OpNum) const {
   unsigned SrcReg = MI.getOperand(OpNum).getReg();
-  unsigned SrcRegNum = X86_MC::getX86RegNum(MI.getOperand(OpNum).getReg());
+  unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg());
   if (X86II::isX86_64ExtendedReg(SrcReg))
     SrcRegNum |= 8;
 
@@ -923,17 +931,6 @@
   }
 
 
-  // Set the vector length to 256-bit if YMM0-YMM15 is used
-  for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
-    if (!MI.getOperand(i).isReg())
-      continue;
-    if (MI.getOperand(i).isImplicit())
-      continue;
-    unsigned SrcReg = MI.getOperand(i).getReg();
-    if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
-      VEX_L = 1;
-  }
-
   // Classify VEX_B, VEX_4V, VEX_R, VEX_X
   unsigned NumOps = Desc->getNumOperands();
   unsigned CurOp = 0;
@@ -1248,7 +1245,7 @@
 
   case X86II::AddRegFrm: {
     MCE.emitByte(BaseOpcode +
-                 X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg()));
+                 getX86RegNum(MI.getOperand(CurOp++).getReg()));
 
     if (CurOp == NumOps)
       break;
@@ -1283,7 +1280,7 @@
   case X86II::MRMDestReg: {
     MCE.emitByte(BaseOpcode);
     emitRegModRMByte(MI.getOperand(CurOp).getReg(),
-                     X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+                     getX86RegNum(MI.getOperand(CurOp+1).getReg()));
     CurOp += 2;
     break;
   }
@@ -1294,7 +1291,7 @@
     if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
       SrcRegNum++;
     emitMemModRMByte(MI, CurOp,
-                X86_MC::getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+                     getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
     CurOp = SrcRegNum + 1;
     break;
   }
@@ -1310,7 +1307,7 @@
       ++SrcRegNum;
 
     emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(),
-                     X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
     // 2 operands skipped with HasMemOp4, compensate accordingly
     CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
     if (HasVEX_4VOp3)
@@ -1332,7 +1329,7 @@
     intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
       X86II::getSizeOfImm(Desc->TSFlags) : 0;
     emitMemModRMByte(MI, FirstMemOp,
-                     X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
+                     getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
     CurOp += AddrOperands + 1;
     if (HasVEX_4VOp3)
       ++CurOp;
@@ -1422,7 +1419,7 @@
     MCE.emitByte(BaseOpcode);
     // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
     emitRegModRMByte(MI.getOperand(CurOp).getReg(),
-                     X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
     ++CurOp;
     break;
 
@@ -1455,7 +1452,7 @@
       const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
                                                          : CurOp);
       ++CurOp;
-      unsigned RegNum = X86_MC::getX86RegNum(MO.getReg()) << 4;
+      unsigned RegNum = getX86RegNum(MO.getReg()) << 4;
       if (X86II::isX86_64ExtendedReg(MO.getReg()))
         RegNum |= 1 << 7;
       // If there is an additional 5th operand it must be an immediate, which

Removed: llvm/branches/AMDILBackend/lib/Target/X86/X86ELFWriterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86ELFWriterInfo.cpp?rev=172540&view=auto
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86ELFWriterInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86ELFWriterInfo.cpp (removed)
@@ -1,147 +0,0 @@
-//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF writer information for the X86 backend.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86ELFWriterInfo.h"
-#include "X86Relocations.h"
-#include "llvm/Function.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-//  Implementation of the X86ELFWriterInfo class
-//===----------------------------------------------------------------------===//
-
-X86ELFWriterInfo::X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_)
-  : TargetELFWriterInfo(is64Bit_, isLittleEndian_) {
-    EMachine = is64Bit ? EM_X86_64 : EM_386;
-  }
-
-X86ELFWriterInfo::~X86ELFWriterInfo() {}
-
-unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
-  if (is64Bit) {
-    switch(MachineRelTy) {
-    case X86::reloc_pcrel_word:
-      return ELF::R_X86_64_PC32;
-    case X86::reloc_absolute_word:
-      return ELF::R_X86_64_32;
-    case X86::reloc_absolute_word_sext:
-      return ELF::R_X86_64_32S;
-    case X86::reloc_absolute_dword:
-      return ELF::R_X86_64_64;
-    case X86::reloc_picrel_word:
-    default:
-      llvm_unreachable("unknown x86_64 machine relocation type");
-    }
-  } else {
-    switch(MachineRelTy) {
-    case X86::reloc_pcrel_word:
-      return ELF::R_386_PC32;
-    case X86::reloc_absolute_word:
-      return ELF::R_386_32;
-    case X86::reloc_absolute_word_sext:
-    case X86::reloc_absolute_dword:
-    case X86::reloc_picrel_word:
-    default:
-      llvm_unreachable("unknown x86 machine relocation type");
-    }
-  }
-}
-
-long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
-                                                    long int Modifier) const {
-  if (is64Bit) {
-    switch(RelTy) {
-    case ELF::R_X86_64_PC32: return Modifier - 4;
-    case ELF::R_X86_64_32:
-    case ELF::R_X86_64_32S:
-    case ELF::R_X86_64_64:
-      return Modifier;
-    default:
-      llvm_unreachable("unknown x86_64 relocation type");
-    }
-  } else {
-    switch(RelTy) {
-    case ELF::R_386_PC32: return Modifier - 4;
-    case ELF::R_386_32: return Modifier;
-    default:
-      llvm_unreachable("unknown x86 relocation type");
-    }
-  }
-}
-
-unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
-  if (is64Bit) {
-    switch(RelTy) {
-    case ELF::R_X86_64_PC32:
-    case ELF::R_X86_64_32:
-    case ELF::R_X86_64_32S:
-        return 32;
-    case ELF::R_X86_64_64:
-        return 64;
-    default:
-      llvm_unreachable("unknown x86_64 relocation type");
-    }
-  } else {
-    switch(RelTy) {
-    case ELF::R_386_PC32:
-    case ELF::R_386_32:
-        return 32;
-    default:
-      llvm_unreachable("unknown x86 relocation type");
-    }
-  }
-}
-
-bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
-  if (is64Bit) {
-    switch(RelTy) {
-    case ELF::R_X86_64_PC32:
-        return true;
-    case ELF::R_X86_64_32:
-    case ELF::R_X86_64_32S:
-    case ELF::R_X86_64_64:
-        return false;
-    default:
-      llvm_unreachable("unknown x86_64 relocation type");
-    }
-  } else {
-    switch(RelTy) {
-    case ELF::R_386_PC32:
-        return true;
-    case ELF::R_386_32:
-        return false;
-    default:
-      llvm_unreachable("unknown x86 relocation type");
-    }
-  }
-}
-
-unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
-  return is64Bit ?
-    X86::reloc_absolute_dword : X86::reloc_absolute_word;
-}
-
-long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset,
-                                             unsigned RelOffset,
-                                             unsigned RelTy) const {
-
-  if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32)
-    return SymOffset - (RelOffset + 4);
-
-  llvm_unreachable("computeRelocation unknown for this relocation type");
-}

Removed: llvm/branches/AMDILBackend/lib/Target/X86/X86ELFWriterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86ELFWriterInfo.h?rev=172540&view=auto
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86ELFWriterInfo.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86ELFWriterInfo.h (removed)
@@ -1,59 +0,0 @@
-//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF writer information for the X86 backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef X86_ELF_WRITER_INFO_H
-#define X86_ELF_WRITER_INFO_H
-
-#include "llvm/Target/TargetELFWriterInfo.h"
-
-namespace llvm {
-
-  class X86ELFWriterInfo : public TargetELFWriterInfo {
-
-  public:
-    X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_);
-    virtual ~X86ELFWriterInfo();
-
-    /// getRelocationType - Returns the target specific ELF Relocation type.
-    /// 'MachineRelTy' contains the object code independent relocation type
-    virtual unsigned getRelocationType(unsigned MachineRelTy) const;
-
-    /// hasRelocationAddend - True if the target uses an addend in the
-    /// ELF relocation entry.
-    virtual bool hasRelocationAddend() const { return is64Bit ? true : false; }
-
-    /// getDefaultAddendForRelTy - Gets the default addend value for a
-    /// relocation entry based on the target ELF relocation type.
-    virtual long int getDefaultAddendForRelTy(unsigned RelTy,
-                                              long int Modifier = 0) const;
-
-    /// getRelTySize - Returns the size of relocatable field in bits
-    virtual unsigned getRelocationTySize(unsigned RelTy) const;
-
-    /// isPCRelativeRel - True if the relocation type is pc relative
-    virtual bool isPCRelativeRel(unsigned RelTy) const;
-
-    /// getJumpTableRelocationTy - Returns the machine relocation type used
-    /// to reference a jumptable.
-    virtual unsigned getAbsoluteLabelMachineRelTy() const;
-
-    /// computeRelocation - Some relocatable fields could be relocated
-    /// directly, avoiding the relocation symbol emission, compute the
-    /// final relocation value for this symbol.
-    virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
-                                       unsigned RelTy) const;
-  };
-
-} // end llvm namespace
-
-#endif // X86_ELF_WRITER_INFO_H

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86FastISel.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86FastISel.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86FastISel.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86FastISel.cpp Tue Jan 15 11:16:16 2013
@@ -45,9 +45,9 @@
   /// make the right decision when generating code for different targets.
   const X86Subtarget *Subtarget;
 
-  /// StackPtr - Register used as the stack pointer.
+  /// RegInfo - X86 register info.
   ///
-  unsigned StackPtr;
+  const X86RegisterInfo *RegInfo;
 
   /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
   /// floating point ops.
@@ -61,9 +61,9 @@
                        const TargetLibraryInfo *libInfo)
     : FastISel(funcInfo, libInfo) {
     Subtarget = &TM.getSubtarget<X86Subtarget>();
-    StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
     X86ScalarSSEf64 = Subtarget->hasSSE2();
     X86ScalarSSEf32 = Subtarget->hasSSE1();
+    RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
   }
 
   virtual bool TargetSelectInstruction(const Instruction *I);
@@ -157,9 +157,9 @@
   // For now, require SSE/SSE2 for performing floating-point operations,
   // since x87 requires additional work.
   if (VT == MVT::f64 && !X86ScalarSSEf64)
-     return false;
+    return false;
   if (VT == MVT::f32 && !X86ScalarSSEf32)
-     return false;
+    return false;
   // Similarly, no f80 support yet.
   if (VT == MVT::f80)
     return false;
@@ -710,6 +710,8 @@
 bool X86FastISel::X86SelectRet(const Instruction *I) {
   const ReturnInst *Ret = cast<ReturnInst>(I);
   const Function &F = *I->getParent()->getParent();
+  const X86MachineFunctionInfo *X86MFInfo =
+      FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
 
   if (!FuncInfo.CanLowerReturn)
     return false;
@@ -724,8 +726,7 @@
     return false;
 
   // Don't handle popping bytes on return for now.
-  if (FuncInfo.MF->getInfo<X86MachineFunctionInfo>()
-        ->getBytesToPopOnReturn() != 0)
+  if (X86MFInfo->getBytesToPopOnReturn() != 0)
     return 0;
 
   // fastcc with -tailcallopt is intended to provide a guaranteed
@@ -809,6 +810,19 @@
     MRI.addLiveOut(VA.getLocReg());
   }
 
+  // The x86-64 ABI for returning structs by value requires that we copy
+  // the sret argument into %rax for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into %rax.
+  if (Subtarget->is64Bit() && F.hasStructRetAttr()) {
+    unsigned Reg = X86MFInfo->getSRetReturnReg();
+    assert(Reg &&
+           "SRetReturnReg should have been set in LowerFormalArguments()!");
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            X86::RAX).addReg(Reg);
+    MRI.addLiveOut(X86::RAX);
+  }
+
   // Now emit the RET.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET));
   return true;
@@ -1527,9 +1541,9 @@
   CallingConv::ID CC = CS.getCallingConv();
   if (CC == CallingConv::Fast || CC == CallingConv::GHC)
     return 0;
-  if (!CS.paramHasAttr(1, Attribute::StructRet))
+  if (!CS.paramHasAttr(1, Attributes::StructRet))
     return 0;
- if (CS.paramHasAttr(1, Attribute::InReg))
+  if (CS.paramHasAttr(1, Attributes::InReg))
     return 0;
   return 4;
 }
@@ -1608,12 +1622,12 @@
     Value *ArgVal = *i;
     ISD::ArgFlagsTy Flags;
     unsigned AttrInd = i - CS.arg_begin() + 1;
-    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+    if (CS.paramHasAttr(AttrInd, Attributes::SExt))
       Flags.setSExt();
-    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+    if (CS.paramHasAttr(AttrInd, Attributes::ZExt))
       Flags.setZExt();
 
-    if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
+    if (CS.paramHasAttr(AttrInd, Attributes::ByVal)) {
       PointerType *Ty = cast<PointerType>(ArgVal->getType());
       Type *ElementTy = Ty->getElementType();
       unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
@@ -1627,9 +1641,9 @@
         return false;
     }
 
-    if (CS.paramHasAttr(AttrInd, Attribute::InReg))
+    if (CS.paramHasAttr(AttrInd, Attributes::InReg))
       Flags.setInReg();
-    if (CS.paramHasAttr(AttrInd, Attribute::Nest))
+    if (CS.paramHasAttr(AttrInd, Attributes::Nest))
       Flags.setNest();
 
     // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
@@ -1771,7 +1785,7 @@
     } else {
       unsigned LocMemOffset = VA.getLocMemOffset();
       X86AddressMode AM;
-      AM.Base.Reg = StackPtr;
+      AM.Base.Reg = RegInfo->getStackRegister();
       AM.Disp = LocMemOffset;
       const Value *ArgVal = ArgVals[VA.getValNo()];
       ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()];
@@ -1897,11 +1911,11 @@
       ISD::InputArg MyFlags;
       MyFlags.VT = RegisterVT.getSimpleVT();
       MyFlags.Used = !CS.getInstruction()->use_empty();
-      if (CS.paramHasAttr(0, Attribute::SExt))
+      if (CS.paramHasAttr(0, Attributes::SExt))
         MyFlags.Flags.setSExt();
-      if (CS.paramHasAttr(0, Attribute::ZExt))
+      if (CS.paramHasAttr(0, Attributes::ZExt))
         MyFlags.Flags.setZExt();
-      if (CS.paramHasAttr(0, Attribute::InReg))
+      if (CS.paramHasAttr(0, Attributes::InReg))
         MyFlags.Flags.setInReg();
       Ins.push_back(MyFlags);
     }
@@ -2014,13 +2028,17 @@
 unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
   MVT VT;
   if (!isTypeLegal(C->getType(), VT))
-    return false;
+    return 0;
+
+  // Can't handle alternate code models yet.
+  if (TM.getCodeModel() != CodeModel::Small)
+    return 0;
 
   // Get opcode and regclass of the output for the given load instruction.
   unsigned Opc = 0;
   const TargetRegisterClass *RC = NULL;
   switch (VT.SimpleTy) {
-  default: return false;
+  default: return 0;
   case MVT::i8:
     Opc = X86::MOV8rm;
     RC  = &X86::GR8RegClass;
@@ -2058,7 +2076,7 @@
     break;
   case MVT::f80:
     // No f80 support yet.
-    return false;
+    return 0;
   }
 
   // Materialize addresses with LEA instructions.
@@ -2142,28 +2160,28 @@
   unsigned Opc = 0;
   const TargetRegisterClass *RC = NULL;
   switch (VT.SimpleTy) {
-    default: return false;
-    case MVT::f32:
-      if (X86ScalarSSEf32) {
-        Opc = X86::FsFLD0SS;
-        RC  = &X86::FR32RegClass;
-      } else {
-        Opc = X86::LD_Fp032;
-        RC  = &X86::RFP32RegClass;
-      }
-      break;
-    case MVT::f64:
-      if (X86ScalarSSEf64) {
-        Opc = X86::FsFLD0SD;
-        RC  = &X86::FR64RegClass;
-      } else {
-        Opc = X86::LD_Fp064;
-        RC  = &X86::RFP64RegClass;
-      }
-      break;
-    case MVT::f80:
-      // No f80 support yet.
-      return false;
+  default: return false;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = X86::FsFLD0SS;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp032;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = X86::FsFLD0SD;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp064;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return false;
   }
 
   unsigned ResultReg = createResultReg(RC);
@@ -2182,7 +2200,7 @@
   if (!X86SelectAddress(LI->getOperand(0), AM))
     return false;
 
-  X86InstrInfo &XII = (X86InstrInfo&)TII;
+  const X86InstrInfo &XII = (const X86InstrInfo&)TII;
 
   unsigned Size = TD.getTypeAllocSize(LI->getType());
   unsigned Alignment = LI->getAlignment();

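The TargetMaterializeConstant hunks above replace "return false;" with
"return 0;". The hook returns an unsigned virtual-register number, so the old
spelling only compiled through the implicit bool-to-unsigned conversion, with
zero doing double duty as the "no register" sentinel. A minimal standalone
sketch of that convention (the function below is hypothetical, not LLVM API):

  #include <cassert>

  // Sketch only: 0 plays the role of "could not materialize", which is why
  // "return false;" happened to work but "return 0;" says what is meant.
  unsigned materializeConstant(bool canHandle) {
    if (!canHandle)
      return 0;  // failure sentinel, previously spelled "return false;"
    return 42;   // stand-in for a real virtual-register number
  }

  int main() {
    assert(materializeConstant(false) == 0);
    assert(materializeConstant(true) != 0);
    return 0;
  }
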
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86FloatingPoint.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86FloatingPoint.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86FloatingPoint.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86FloatingPoint.cpp Tue Jan 15 11:16:16 2013
@@ -171,6 +171,7 @@
     // Shuffle live registers to match the expectations of successor blocks.
     void finishBlockStack();
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     void dumpStack() const {
       dbgs() << "Stack contents:";
       for (unsigned i = 0; i != StackTop; ++i) {
@@ -181,6 +182,7 @@
         dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
       dbgs() << "\n";
     }
+#endif
 
     /// getSlot - Return the stack slot number a particular register number is
     /// in.
@@ -575,8 +577,8 @@
     friend bool operator<(const TableEntry &TE, unsigned V) {
       return TE.from < V;
     }
-    friend bool LLVM_ATTRIBUTE_USED operator<(unsigned V,
-                                              const TableEntry &TE) {
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+                                                const TableEntry &TE) {
       return V < TE.from;
     }
   };

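The X86FloatingPoint hunks above compile dumpStack() only under
!defined(NDEBUG) || defined(LLVM_ENABLE_DUMP), so plain release builds no
longer carry dead dump code, and they swap LLVM_ATTRIBUTE_USED for
LLVM_ATTRIBUTE_UNUSED on a comparison operator: the former forces the symbol
to be emitted even when unreferenced, while the latter merely silences the
unused-function warning, which is all that is wanted here. A self-contained
sketch of the same guard pattern (illustrative C++, not LLVM code):

  #include <cstdio>

  struct StackModel {
    unsigned Regs[8];
    unsigned Top = 0;

  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // Exists in asserts builds, or when a release build opts in with
    // -DLLVM_ENABLE_DUMP; otherwise it is not compiled at all.
    void dump() const {
      std::printf("Stack contents:");
      for (unsigned i = 0; i != Top; ++i)
        std::printf(" FP%u", Regs[i]);
      std::printf("\n");
    }
  #endif
  };

  int main() {
    StackModel S;
    S.Regs[S.Top++] = 3;
  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    S.dump();  // call sites need the same guard for release builds to link
  #endif
    return 0;
  }
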
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86FrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86FrameLowering.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86FrameLowering.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86FrameLowering.cpp Tue Jan 15 11:16:16 2013
@@ -25,7 +25,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/ADT/SmallSet.h"
@@ -313,11 +313,11 @@
   if (CSI.empty()) return;
 
   std::vector<MachineMove> &Moves = MMI.getFrameMoves();
-  const TargetData *TD = TM.getTargetData();
+  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
   bool HasFP = hasFP(MF);
 
   // Calculate the number of bytes used to store the return address.
-  int stackGrowth = -TD->getPointerSize();
+  int stackGrowth = -RegInfo->getSlotSize();
 
   // FIXME: This is a dirty hack. The code itself is a mess right now.
   // It should be rewritten from scratch and generalized at some point.
@@ -674,7 +674,7 @@
   // function, and use up to 128 bytes of stack space, don't have a frame
   // pointer, calls, or dynamic alloca then we do not need to adjust the
   // stack pointer (we fit in the Red Zone).
-  if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) &&
+  if (Is64Bit && !Fn->getFnAttributes().hasAttribute(Attributes::NoRedZone) &&
       !RegInfo->needsStackRealignment(MF) &&
       !MFI->hasVarSizedObjects() &&                     // No dynamic alloca.
       !MFI->adjustsStack() &&                           // No calls.
@@ -715,9 +715,8 @@
   //        ELSE                        => DW_CFA_offset_extended
 
   std::vector<MachineMove> &Moves = MMI.getFrameMoves();
-  const TargetData *TD = MF.getTarget().getTargetData();
   uint64_t NumBytes = 0;
-  int stackGrowth = -TD->getPointerSize();
+  int stackGrowth = -SlotSize;
 
   if (HasFP) {
     // Calculate required stack adjustment.
@@ -836,8 +835,6 @@
     MI->getOperand(3).setIsDead();
   }
 
-  DL = MBB.findDebugLoc(MBBI);
-
   // If there is an SUB32ri of ESP immediately before this instruction, merge
   // the two. This can be the case when tail call elimination is enabled and
   // the callee has more arguments than the caller.

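The X86FrameLowering hunks above stop deriving the stack-growth step from the
data layout's pointer size and use the slot size from X86RegisterInfo
instead. The quantity actually meant is the width of one stack slot (the
return-address slot), and pointer size and slot size need not agree; under
the x32 ABI, for instance, pointers are 4 bytes but stack slots are still 8.
Because the x86 stack grows downward, each successive slot sits at a more
negative offset. Illustrative arithmetic only, assuming the usual x86-64
values (plain C++, no LLVM types):

  #include <cassert>

  int main() {
    const int SlotSize = 8;             // return-address slot on x86-64
    const int stackGrowth = -SlotSize;  // stack grows toward lower addresses

    // Offsets relative to the CFA: the return address is pushed first,
    // and a saved frame pointer would land one slot below it.
    int RetAddrOffset = stackGrowth;      // -8
    int SavedFPOffset = 2 * stackGrowth;  // -16

    assert(RetAddrOffset == -8 && SavedFPOffset == -16);
    return 0;
  }
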
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86ISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86ISelDAGToDAG.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86ISelDAGToDAG.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86ISelDAGToDAG.cpp Tue Jan 15 11:16:16 2013
@@ -100,6 +100,7 @@
       Base_Reg = Reg;
     }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     void dump() {
       dbgs() << "X86ISelAddressMode " << this << '\n';
       dbgs() << "Base_Reg ";
@@ -133,6 +134,7 @@
         dbgs() << "nul";
       dbgs() << " JT" << JT << " Align" << Align << '\n';
     }
+#endif
   };
 }
 
@@ -189,7 +191,6 @@
     SDNode *Select(SDNode *N);
     SDNode *SelectGather(SDNode *N, unsigned Opc);
     SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
-    SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
     SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
 
     bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
@@ -244,13 +245,15 @@
       else if (AM.CP)
         Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                              AM.Align, AM.Disp, AM.SymbolFlags);
-      else if (AM.ES)
+      else if (AM.ES) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
         Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
-      else if (AM.JT != -1)
+      } else if (AM.JT != -1) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
         Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
-      else if (AM.BlockAddr)
-        Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32,
-                                       true, AM.SymbolFlags);
+      } else if (AM.BlockAddr)
+        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+                                             AM.SymbolFlags);
       else
         Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
 
@@ -359,7 +362,7 @@
 /// MoveBelowCallOrigChain - Replace the original chain operand of the call with
 /// load's chain operand and move load below the call's chain operand.
 static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
-                                  SDValue Call, SDValue OrigChain) {
+                               SDValue Call, SDValue OrigChain) {
   SmallVector<SDValue, 8> Ops;
   SDValue Chain = OrigChain.getOperand(0);
   if (Chain.getNode() == Load.getNode())
@@ -383,11 +386,13 @@
   CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size());
   CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                              Load.getOperand(1), Load.getOperand(2));
+
+  unsigned NumOps = Call.getNode()->getNumOperands();
   Ops.clear();
   Ops.push_back(SDValue(Load.getNode(), 1));
-  for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i)
+  for (unsigned i = 1, e = NumOps; i != e; ++i)
     Ops.push_back(Call.getOperand(i));
-  CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], Ops.size());
+  CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps);
 }
 
 /// isCalleeLoad - Return true if call address is a load and it can be
@@ -396,6 +401,10 @@
 /// In the case of a tail call, there isn't a callseq node between the call
 /// chain and the load.
 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+  // The transformation is somewhat dangerous if the call's chain was glued to
+  // the call. After MoveBelowOrigChain the load is moved between the call and
+  // the chain; this can create a cycle if the load is not folded. So it is
+  // *really* important that we are sure the load will be folded.
   if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
     return false;
   LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
@@ -425,7 +434,8 @@
 
 void X86DAGToDAGISel::PreprocessISelDAG() {
   // OptForSize is used in pattern predicates that isel is matching.
-  OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+  OptForSize = MF->getFunction()->getFnAttributes().
+    hasAttribute(Attributes::OptimizeForSize);
 
   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
        E = CurDAG->allnodes_end(); I != E; ) {
@@ -433,7 +443,10 @@
 
     if (OptLevel != CodeGenOpt::None &&
         (N->getOpcode() == X86ISD::CALL ||
-         N->getOpcode() == X86ISD::TC_RETURN)) {
+         (N->getOpcode() == X86ISD::TC_RETURN &&
+          // Only do this if the load can be folded into TC_RETURN.
+          (Subtarget->is64Bit() ||
+           getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
       /// Also try moving call address load from outside callseq_start to just
       /// before the call to allow it to be folded.
       ///
@@ -652,10 +665,16 @@
     } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
       AM.JT = J->getIndex();
       AM.SymbolFlags = J->getTargetFlags();
-    } else {
-      AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
-      AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
-    }
+    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.BlockAddr = BA->getBlockAddress();
+      AM.SymbolFlags = BA->getTargetFlags();
+      if (FoldOffsetIntoAddress(BA->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else
+      llvm_unreachable("Unhandled symbol reference node.");
 
     if (N.getOpcode() == X86ISD::WrapperRIP)
       AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
@@ -684,10 +703,12 @@
     } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
       AM.JT = J->getIndex();
       AM.SymbolFlags = J->getTargetFlags();
-    } else {
-      AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
-      AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
-    }
+    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+      AM.BlockAddr = BA->getBlockAddress();
+      AM.Disp += BA->getOffset();
+      AM.SymbolFlags = BA->getTargetFlags();
+    } else
+      llvm_unreachable("Unhandled symbol reference node.");
     return false;
   }
 
@@ -1011,7 +1032,7 @@
           AM.IndexReg = ShVal.getNode()->getOperand(0);
           ConstantSDNode *AddVal =
             cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
-          uint64_t Disp = AddVal->getSExtValue() << Val;
+          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
           if (!FoldOffsetIntoAddress(Disp, AM))
             return false;
         }
@@ -1281,7 +1302,9 @@
       // that are not a MemSDNode, and thus don't have proper addrspace info.
       Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
       Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
-      Parent->getOpcode() != X86ISD::TLSCALL) { // Fixme
+      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
     unsigned AddrSpace =
       cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
     // AddrSpace 256 -> GS, 257 -> FS.
@@ -1468,6 +1491,7 @@
   SDValue In1 = Node->getOperand(1);
   SDValue In2L = Node->getOperand(2);
   SDValue In2H = Node->getOperand(3);
+
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
     return NULL;
@@ -1481,159 +1505,13 @@
   return ResNode;
 }
 
-// FIXME: Figure out some way to unify this with the 'or' and other code
-// below.
-SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
-  if (Node->hasAnyUseOfValue(0))
-    return 0;
-
-  // Optimize common patterns for __sync_add_and_fetch and
-  // __sync_sub_and_fetch where the result is not used. This allows us
-  // to use "lock" version of add, sub, inc, dec instructions.
-  // FIXME: Do not use special instructions but instead add the "lock"
-  // prefix to the target node somehow. The extra information will then be
-  // transferred to machine instruction and it denotes the prefix.
-  SDValue Chain = Node->getOperand(0);
-  SDValue Ptr = Node->getOperand(1);
-  SDValue Val = Node->getOperand(2);
-  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
-    return 0;
-
-  bool isInc = false, isDec = false, isSub = false, isCN = false;
-  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
-  if (CN && CN->getSExtValue() == (int32_t)CN->getSExtValue()) {
-    isCN = true;
-    int64_t CNVal = CN->getSExtValue();
-    if (CNVal == 1)
-      isInc = true;
-    else if (CNVal == -1)
-      isDec = true;
-    else if (CNVal >= 0)
-      Val = CurDAG->getTargetConstant(CNVal, NVT);
-    else {
-      isSub = true;
-      Val = CurDAG->getTargetConstant(-CNVal, NVT);
-    }
-  } else if (Val.hasOneUse() &&
-             Val.getOpcode() == ISD::SUB &&
-             X86::isZeroNode(Val.getOperand(0))) {
-    isSub = true;
-    Val = Val.getOperand(1);
-  }
-
-  DebugLoc dl = Node->getDebugLoc();
-  unsigned Opc = 0;
-  switch (NVT.getSimpleVT().SimpleTy) {
-  default: return 0;
-  case MVT::i8:
-    if (isInc)
-      Opc = X86::LOCK_INC8m;
-    else if (isDec)
-      Opc = X86::LOCK_DEC8m;
-    else if (isSub) {
-      if (isCN)
-        Opc = X86::LOCK_SUB8mi;
-      else
-        Opc = X86::LOCK_SUB8mr;
-    } else {
-      if (isCN)
-        Opc = X86::LOCK_ADD8mi;
-      else
-        Opc = X86::LOCK_ADD8mr;
-    }
-    break;
-  case MVT::i16:
-    if (isInc)
-      Opc = X86::LOCK_INC16m;
-    else if (isDec)
-      Opc = X86::LOCK_DEC16m;
-    else if (isSub) {
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = X86::LOCK_SUB16mi8;
-        else
-          Opc = X86::LOCK_SUB16mi;
-      } else
-        Opc = X86::LOCK_SUB16mr;
-    } else {
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = X86::LOCK_ADD16mi8;
-        else
-          Opc = X86::LOCK_ADD16mi;
-      } else
-        Opc = X86::LOCK_ADD16mr;
-    }
-    break;
-  case MVT::i32:
-    if (isInc)
-      Opc = X86::LOCK_INC32m;
-    else if (isDec)
-      Opc = X86::LOCK_DEC32m;
-    else if (isSub) {
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = X86::LOCK_SUB32mi8;
-        else
-          Opc = X86::LOCK_SUB32mi;
-      } else
-        Opc = X86::LOCK_SUB32mr;
-    } else {
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = X86::LOCK_ADD32mi8;
-        else
-          Opc = X86::LOCK_ADD32mi;
-      } else
-        Opc = X86::LOCK_ADD32mr;
-    }
-    break;
-  case MVT::i64:
-    if (isInc)
-      Opc = X86::LOCK_INC64m;
-    else if (isDec)
-      Opc = X86::LOCK_DEC64m;
-    else if (isSub) {
-      Opc = X86::LOCK_SUB64mr;
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = X86::LOCK_SUB64mi8;
-        else if (i64immSExt32(Val.getNode()))
-          Opc = X86::LOCK_SUB64mi32;
-      }
-    } else {
-      Opc = X86::LOCK_ADD64mr;
-      if (isCN) {
-        if (immSext8(Val.getNode()))
-          Opc = X86::LOCK_ADD64mi8;
-        else if (i64immSExt32(Val.getNode()))
-          Opc = X86::LOCK_ADD64mi32;
-      }
-    }
-    break;
-  }
-
-  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                 dl, NVT), 0);
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
-  if (isInc || isDec) {
-    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
-    SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0);
-    cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
-    SDValue RetVals[] = { Undef, Ret };
-    return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
-  } else {
-    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
-    SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
-    cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
-    SDValue RetVals[] = { Undef, Ret };
-    return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
-  }
-}
-
+/// Atomic opcode table
+///
 enum AtomicOpc {
+  ADD,
+  SUB,
+  INC,
+  DEC,
   OR,
   AND,
   XOR,
@@ -1657,6 +1535,58 @@
 
 static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
   {
+    X86::LOCK_ADD8mi,
+    X86::LOCK_ADD8mr,
+    X86::LOCK_ADD16mi8,
+    X86::LOCK_ADD16mi,
+    X86::LOCK_ADD16mr,
+    X86::LOCK_ADD32mi8,
+    X86::LOCK_ADD32mi,
+    X86::LOCK_ADD32mr,
+    X86::LOCK_ADD64mi8,
+    X86::LOCK_ADD64mi32,
+    X86::LOCK_ADD64mr,
+  },
+  {
+    X86::LOCK_SUB8mi,
+    X86::LOCK_SUB8mr,
+    X86::LOCK_SUB16mi8,
+    X86::LOCK_SUB16mi,
+    X86::LOCK_SUB16mr,
+    X86::LOCK_SUB32mi8,
+    X86::LOCK_SUB32mi,
+    X86::LOCK_SUB32mr,
+    X86::LOCK_SUB64mi8,
+    X86::LOCK_SUB64mi32,
+    X86::LOCK_SUB64mr,
+  },
+  {
+    0,
+    X86::LOCK_INC8m,
+    0,
+    0,
+    X86::LOCK_INC16m,
+    0,
+    0,
+    X86::LOCK_INC32m,
+    0,
+    0,
+    X86::LOCK_INC64m,
+  },
+  {
+    0,
+    X86::LOCK_DEC8m,
+    0,
+    0,
+    X86::LOCK_DEC16m,
+    0,
+    0,
+    X86::LOCK_DEC32m,
+    0,
+    0,
+    X86::LOCK_DEC64m,
+  },
+  {
     X86::LOCK_OR8mi,
     X86::LOCK_OR8mr,
     X86::LOCK_OR16mi8,
@@ -1667,7 +1597,7 @@
     X86::LOCK_OR32mr,
     X86::LOCK_OR64mi8,
     X86::LOCK_OR64mi32,
-    X86::LOCK_OR64mr
+    X86::LOCK_OR64mr,
   },
   {
     X86::LOCK_AND8mi,
@@ -1680,7 +1610,7 @@
     X86::LOCK_AND32mr,
     X86::LOCK_AND64mi8,
     X86::LOCK_AND64mi32,
-    X86::LOCK_AND64mr
+    X86::LOCK_AND64mr,
   },
   {
     X86::LOCK_XOR8mi,
@@ -1693,18 +1623,74 @@
     X86::LOCK_XOR32mr,
     X86::LOCK_XOR64mi8,
     X86::LOCK_XOR64mi32,
-    X86::LOCK_XOR64mr
+    X86::LOCK_XOR64mr,
   }
 };
 
+// Return the target constant operand for atomic-load-op and do simple
+// translations, such as from atomic-load-add to lock-sub. The return value is
+// one of the following 3 cases:
+// + target-constant: the operand can be encoded as a target constant.
+// + empty: the operand is no longer needed once the new op is selected.
+// + non-empty: otherwise.
+static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
+                                                DebugLoc dl,
+                                                enum AtomicOpc &Op, EVT NVT,
+                                                SDValue Val) {
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
+    int64_t CNVal = CN->getSExtValue();
+    // Quit if not 32-bit imm.
+    if ((int32_t)CNVal != CNVal)
+      return Val;
+    // For atomic-load-add, we could do some optimizations.
+    if (Op == ADD) {
+      // Translate to INC/DEC if ADD by 1 or -1.
+      if ((CNVal == 1) || (CNVal == -1)) {
+        Op = (CNVal == 1) ? INC : DEC;
+        // No more constant operand after being translated into INC/DEC.
+        return SDValue();
+      }
+      // Translate to SUB if ADD by negative value.
+      if (CNVal < 0) {
+        Op = SUB;
+        CNVal = -CNVal;
+      }
+    }
+    return CurDAG->getTargetConstant(CNVal, NVT);
+  }
+
+  // If the value operand is single-used, try to optimize it.
+  if (Op == ADD && Val.hasOneUse()) {
+    // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
+    if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
+      Op = SUB;
+      return Val.getOperand(1);
+    }
+    // A special case for i16, which needs truncating as, in most cases, it's
+    // promoted to i32. We will translate
+    // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x))
+    if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
+        Val.getOperand(0).getOpcode() == ISD::SUB &&
+        X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
+      Op = SUB;
+      Val = Val.getOperand(0);
+      return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
+                                            Val.getOperand(1));
+    }
+  }
+
+  return Val;
+}
+
 SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
   if (Node->hasAnyUseOfValue(0))
     return 0;
 
+  DebugLoc dl = Node->getDebugLoc();
+
   // Optimize common patterns for __sync_or_and_fetch and similar arith
   // operations where the result is not used. This allows us to use the "lock"
   // version of the arithmetic instruction.
-  // FIXME: Same as for 'add' and 'sub', try to merge those down here.
   SDValue Chain = Node->getOperand(0);
   SDValue Ptr = Node->getOperand(1);
   SDValue Val = Node->getOperand(2);
@@ -1715,6 +1701,8 @@
   // Which index into the table.
   enum AtomicOpc Op;
   switch (Node->getOpcode()) {
+    default:
+      return 0;
     case ISD::ATOMIC_LOAD_OR:
       Op = OR;
       break;
@@ -1724,16 +1712,14 @@
     case ISD::ATOMIC_LOAD_XOR:
       Op = XOR;
       break;
-    default:
-      return 0;
-  }
-
-  bool isCN = false;
-  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
-  if (CN && (int32_t)CN->getSExtValue() == CN->getSExtValue()) {
-    isCN = true;
-    Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT);
+    case ISD::ATOMIC_LOAD_ADD:
+      Op = ADD;
+      break;
   }
+  
+  Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
+  bool isUnOp = !Val.getNode();
+  bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
 
   unsigned Opc = 0;
   switch (NVT.getSimpleVT().SimpleTy) {
@@ -1775,13 +1761,20 @@
 
   assert(Opc != 0 && "Invalid arith lock transform!");
 
-  DebugLoc dl = Node->getDebugLoc();
+  SDValue Ret;
   SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                  dl, NVT), 0);
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
-  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
-  SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
+  if (isUnOp) {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
+                                         array_lengthof(Ops)), 0);
+  } else {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
+                                         array_lengthof(Ops)), 0);
+  }
   cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
   SDValue RetVals[] = { Undef, Ret };
   return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
@@ -2059,16 +2052,24 @@
   case X86ISD::ATOMSUB64_DAG:
   case X86ISD::ATOMNAND64_DAG:
   case X86ISD::ATOMAND64_DAG:
+  case X86ISD::ATOMMAX64_DAG:
+  case X86ISD::ATOMMIN64_DAG:
+  case X86ISD::ATOMUMAX64_DAG:
+  case X86ISD::ATOMUMIN64_DAG:
   case X86ISD::ATOMSWAP64_DAG: {
     unsigned Opc;
     switch (Opcode) {
-    default: llvm_unreachable("Impossible intrinsic");
+    default: llvm_unreachable("Impossible opcode");
     case X86ISD::ATOMOR64_DAG:   Opc = X86::ATOMOR6432;   break;
     case X86ISD::ATOMXOR64_DAG:  Opc = X86::ATOMXOR6432;  break;
     case X86ISD::ATOMADD64_DAG:  Opc = X86::ATOMADD6432;  break;
     case X86ISD::ATOMSUB64_DAG:  Opc = X86::ATOMSUB6432;  break;
     case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
     case X86ISD::ATOMAND64_DAG:  Opc = X86::ATOMAND6432;  break;
+    case X86ISD::ATOMMAX64_DAG:  Opc = X86::ATOMMAX6432;  break;
+    case X86ISD::ATOMMIN64_DAG:  Opc = X86::ATOMMIN6432;  break;
+    case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break;
+    case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break;
     case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
     }
     SDNode *RetVal = SelectAtomic64(Node, Opc);
@@ -2077,15 +2078,10 @@
     break;
   }
 
-  case ISD::ATOMIC_LOAD_ADD: {
-    SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
-    if (RetVal)
-      return RetVal;
-    break;
-  }
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_AND:
-  case ISD::ATOMIC_LOAD_OR: {
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_ADD: {
     SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
     if (RetVal)
       return RetVal;
@@ -2116,10 +2112,11 @@
 
     // Make sure that we don't change the operation by removing bits.
     // This only matters for OR and XOR, AND is unaffected.
-    if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val)
+    uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+    if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
       break;
 
-    unsigned ShlOp, Op = 0;
+    unsigned ShlOp, Op;
     EVT CstVT = NVT;
 
     // Check the minimum bitwidth for the new constant.
@@ -2142,6 +2139,7 @@
       ShlOp = X86::SHL32ri;
 
       switch (Opcode) {
+      default: llvm_unreachable("Impossible opcode");
       case ISD::AND: Op = X86::AND32ri8; break;
       case ISD::OR:  Op =  X86::OR32ri8; break;
       case ISD::XOR: Op = X86::XOR32ri8; break;
@@ -2152,6 +2150,7 @@
       ShlOp = X86::SHL64ri;
 
       switch (Opcode) {
+      default: llvm_unreachable("Impossible opcode");
       case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
       case ISD::OR:  Op = CstVT==MVT::i8?  X86::OR64ri8 :  X86::OR64ri32; break;
       case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
@@ -2197,13 +2196,16 @@
     SDValue N1 = Node->getOperand(1);
 
     bool isSigned = Opcode == ISD::SMUL_LOHI;
+    bool hasBMI2 = Subtarget->hasBMI2();
     if (!isSigned) {
       switch (NVT.getSimpleVT().SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
       case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
-      case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
-      case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+      case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
+                     MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
+      case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
+                     MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
       }
     } else {
       switch (NVT.getSimpleVT().SimpleTy) {
@@ -2215,13 +2217,31 @@
       }
     }
 
-    unsigned LoReg, HiReg;
-    switch (NVT.getSimpleVT().SimpleTy) {
-    default: llvm_unreachable("Unsupported VT!");
-    case MVT::i8:  LoReg = X86::AL;  HiReg = X86::AH;  break;
-    case MVT::i16: LoReg = X86::AX;  HiReg = X86::DX;  break;
-    case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
-    case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+    unsigned SrcReg, LoReg, HiReg;
+    switch (Opc) {
+    default: llvm_unreachable("Unknown MUL opcode!");
+    case X86::IMUL8r:
+    case X86::MUL8r:
+      SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+      break;
+    case X86::IMUL16r:
+    case X86::MUL16r:
+      SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+      break;
+    case X86::IMUL32r:
+    case X86::MUL32r:
+      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+      break;
+    case X86::IMUL64r:
+    case X86::MUL64r:
+      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+      break;
+    case X86::MULX32rr:
+      SrcReg = X86::EDX; LoReg = HiReg = 0;
+      break;
+    case X86::MULX64rr:
+      SrcReg = X86::RDX; LoReg = HiReg = 0;
+      break;
     }
 
     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
@@ -2233,22 +2253,47 @@
         std::swap(N0, N1);
     }
 
-    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
                                           N0, SDValue()).getValue(1);
+    SDValue ResHi, ResLo;
 
     if (foldedLoad) {
+      SDValue Chain;
       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                         InFlag };
-      SDNode *CNode =
-        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops,
-                               array_lengthof(Ops));
-      InFlag = SDValue(CNode, 1);
+      if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops,
+                                               array_lengthof(Ops));
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        Chain = SDValue(CNode, 2);
+        InFlag = SDValue(CNode, 3);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops,
+                                               array_lengthof(Ops));
+        Chain = SDValue(CNode, 0);
+        InFlag = SDValue(CNode, 1);
+      }
 
       // Update the chain.
-      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+      ReplaceUses(N1.getValue(1), Chain);
     } else {
-      SDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag);
-      InFlag = SDValue(CNode, 0);
+      SDValue Ops[] = { N1, InFlag };
+      if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops,
+                                               array_lengthof(Ops));
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        InFlag = SDValue(CNode, 2);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops,
+                                               array_lengthof(Ops));
+        InFlag = SDValue(CNode, 0);
+      }
     }
 
     // Prevent use of AH in a REX instruction by referencing AX instead.
@@ -2273,19 +2318,25 @@
     }
     // Copy the low half of the result, if it is needed.
     if (!SDValue(Node, 0).use_empty()) {
-      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              LoReg, NVT, InFlag);
-      InFlag = Result.getValue(2);
-      ReplaceUses(SDValue(Node, 0), Result);
-      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+      if (ResLo.getNode() == 0) {
+        assert(LoReg && "Register for low half is not defined!");
+        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
+                                       InFlag);
+        InFlag = ResLo.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 0), ResLo);
+      DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
     }
     // Copy the high half of the result, if it is needed.
     if (!SDValue(Node, 1).use_empty()) {
-      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              HiReg, NVT, InFlag);
-      InFlag = Result.getValue(2);
-      ReplaceUses(SDValue(Node, 1), Result);
-      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+      if (ResHi.getNode() == 0) {
+        assert(HiReg && "Register for high half is not defined!");
+        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
+                                       InFlag);
+        InFlag = ResHi.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 1), ResHi);
+      DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
     }
 
     return NULL;
@@ -2486,7 +2537,13 @@
                                                         MVT::i8, Reg);
 
         // Emit a testb.
-        return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, Subreg, Imm);
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one; do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
       }
 
       // For example, "testl %eax, $2048" to "testb %ah, $8".
@@ -2517,8 +2574,13 @@
         // Emit a testb.  The EXTRACT_SUBREG becomes a COPY that can only
         // target GR8_NOREX registers, so make sure the register class is
         // forced.
-        return CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, MVT::i32,
-                                      Subreg, ShiftedImm);
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+                                                 MVT::i32, Subreg, ShiftedImm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one; do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
       }
 
       // For example, "testl %eax, $32776" to "testw %ax, $32776".
@@ -2534,7 +2596,13 @@
                                                         MVT::i16, Reg);
 
         // Emit a testw.
-        return CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, Subreg, Imm);
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one; do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
       }
 
       // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
@@ -2550,7 +2618,13 @@
                                                         MVT::i32, Reg);
 
         // Emit a testl.
-        return CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, Subreg, Imm);
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one; do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
       }
     }
     break;
@@ -2605,85 +2679,6 @@
 
     return Result;
   }
-
-  // FIXME: Custom handling because TableGen doesn't support multiple implicit
-  // defs in an instruction pattern
-  case X86ISD::PCMPESTRI: {
-    SDValue N0 = Node->getOperand(0);
-    SDValue N1 = Node->getOperand(1);
-    SDValue N2 = Node->getOperand(2);
-    SDValue N3 = Node->getOperand(3);
-    SDValue N4 = Node->getOperand(4);
-
-    // Make sure last argument is a constant
-    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N4);
-    if (!Cst)
-      break;
-
-    uint64_t Imm = Cst->getZExtValue();
-
-    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
-                                          X86::EAX, N1, SDValue()).getValue(1);
-    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
-                                  N3, InFlag).getValue(1);
-
-    SDValue Ops[] = { N0, N2, getI8Imm(Imm), InFlag };
-    unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr :
-                                         X86::PCMPESTRIrr;
-    InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
-                                            array_lengthof(Ops)), 0);
-
-    if (!SDValue(Node, 0).use_empty()) {
-      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              X86::ECX, NVT, InFlag);
-      InFlag = Result.getValue(2);
-      ReplaceUses(SDValue(Node, 0), Result);
-    }
-    if (!SDValue(Node, 1).use_empty()) {
-      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              X86::EFLAGS, NVT, InFlag);
-      InFlag = Result.getValue(2);
-      ReplaceUses(SDValue(Node, 1), Result);
-    }
-
-    return NULL;
-  }
-
-  // FIXME: Custom handling because TableGen doesn't support multiple implicit
-  // defs in an instruction pattern
-  case X86ISD::PCMPISTRI: {
-    SDValue N0 = Node->getOperand(0);
-    SDValue N1 = Node->getOperand(1);
-    SDValue N2 = Node->getOperand(2);
-
-    // Make sure last argument is a constant
-    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N2);
-    if (!Cst)
-      break;
-
-    uint64_t Imm = Cst->getZExtValue();
-
-    SDValue Ops[] = { N0, N1, getI8Imm(Imm) };
-    unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr :
-                                         X86::PCMPISTRIrr;
-    SDValue InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
-                                                    array_lengthof(Ops)), 0);
-
-    if (!SDValue(Node, 0).use_empty()) {
-      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              X86::ECX, NVT, InFlag);
-      InFlag = Result.getValue(2);
-      ReplaceUses(SDValue(Node, 0), Result);
-    }
-    if (!SDValue(Node, 1).use_empty()) {
-      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              X86::EFLAGS, NVT, InFlag);
-      InFlag = Result.getValue(2);
-      ReplaceUses(SDValue(Node, 1), Result);
-    }
-
-    return NULL;
-  }
   }
 
   SDNode *ResNode = SelectCode(Node);

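One detail worth calling out in the X86ISelDAGToDAG changes above: the guard
for folding an immediate through a left shift is rewritten from
((Val >> ShlVal) << ShlVal) != Val to a mask test,
(Val & ((1ULL << ShlVal) - 1)) != 0. Both reject the fold for OR/XOR whenever
any of the low ShlVal bits of the immediate are set (those bits would be
shifted away), but the mask form states that intent directly. A standalone
check of the equivalence (plain C++, not LLVM code):

  #include <cassert>
  #include <cstdint>

  // Old formulation: shifting right then left zeroes the low bits, so the
  // result differs from Val exactly when a low bit was set.
  static bool dropsBitsOld(uint64_t Val, unsigned Shl) {
    return ((Val >> Shl) << Shl) != Val;
  }

  // New formulation: test the low Shl bits against a mask directly.
  static bool dropsBitsNew(uint64_t Val, unsigned Shl) {
    uint64_t RemovedBitsMask = (1ULL << Shl) - 1;
    return (Val & RemovedBitsMask) != 0;
  }

  int main() {
    const uint64_t Samples[] = {
        0, 1, 2, 0x80, 0xFF00, 0x123456789ABCDEF0ULL,
        0x8000000000000000ULL, 0xFFFFFFFFFFFFFFFFULL};
    for (uint64_t V : Samples)
      for (unsigned S = 1; S < 64; ++S)  // shift amounts the code can see
        assert(dropsBitsOld(V, S) == dropsBitsNew(V, S));
    return 0;
  }
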
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86ISelLowering.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86ISelLowering.cpp Tue Jan 15 11:16:16 2013
@@ -66,7 +66,7 @@
 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                    SelectionDAG &DAG, DebugLoc dl) {
   EVT VT = Vec.getValueType();
-  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
+  assert(VT.is256BitVector() && "Unexpected vector size!");
   EVT ElVT = VT.getVectorElementType();
   unsigned Factor = VT.getSizeInBits()/128;
   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
@@ -85,7 +85,7 @@
   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                * ElemsPerChunk);
 
-  SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                VecIdx);
 
@@ -105,7 +105,7 @@
     return Result;
 
   EVT VT = Vec.getValueType();
-  assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
+  assert(VT.is128BitVector() && "Unexpected vector size!");
 
   EVT ElVT = VT.getVectorElementType();
   EVT ResultVT = Result.getValueType();
@@ -118,7 +118,7 @@
   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                * ElemsPerChunk);
 
-  SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                      VecIdx);
 }
@@ -158,10 +158,9 @@
   Subtarget = &TM.getSubtarget<X86Subtarget>();
   X86ScalarSSEf64 = Subtarget->hasSSE2();
   X86ScalarSSEf32 = Subtarget->hasSSE1();
-  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
 
   RegInfo = TM.getRegisterInfo();
-  TD = getTargetData();
+  TD = getDataLayout();
 
   // Set up the TargetLowering object.
   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
@@ -180,7 +179,11 @@
     setSchedulingPreference(Sched::ILP);
   else
     setSchedulingPreference(Sched::RegPressure);
-  setStackPointerRegisterToSaveRestore(X86StackPtr);
+  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+  // Bypass i32 with i8 on Atom when compiling with O2
+  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
+    addBypassSlowDiv(32, 8);
 
   if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
     // Setup Windows compiler runtime calls.
@@ -453,6 +456,14 @@
     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
   }
   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+  // support continuations, user-level threading, etc. As a result, no
+  // other SjLj exception interfaces are implemented, so please don't build
+  // your own exception handling based on them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 
   // Darwin ABI issue.
   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
@@ -510,6 +521,10 @@
     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
   }
 
   if (Subtarget->hasCmpxchg16b()) {
@@ -541,6 +556,7 @@
   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
@@ -643,7 +659,9 @@
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 
     if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN           , MVT::f32  , Expand);
       setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
+      setOperationAction(ISD::FCOS           , MVT::f32  , Expand);
       setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
     }
     addLegalFPImmediate(APFloat(+0.0)); // FLD0
@@ -735,6 +753,7 @@
     setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
@@ -824,12 +843,12 @@
     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
-    setOperationAction(ISD::SETCC,              MVT::v4f32, Custom);
   }
 
   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
@@ -858,6 +877,7 @@
     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 
     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
@@ -870,27 +890,18 @@
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
-
     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
-      EVT VT = (MVT::SimpleValueType)i;
+      MVT VT = (MVT::SimpleValueType)i;
       // Do not attempt to custom lower non-power-of-2 vectors
       if (!isPowerOf2_32(VT.getVectorNumElements()))
         continue;
       // Do not attempt to custom lower non-128-bit vectors
       if (!VT.is128BitVector())
         continue;
-      setOperationAction(ISD::BUILD_VECTOR,
-                         VT.getSimpleVT().SimpleTy, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,
-                         VT.getSimpleVT().SimpleTy, Custom);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
-                         VT.getSimpleVT().SimpleTy, Custom);
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
@@ -907,23 +918,22 @@
 
     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
-      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
-      EVT VT = SVT;
+      MVT VT = (MVT::SimpleValueType)i;
 
       // Do not attempt to promote non-128-bit vectors
       if (!VT.is128BitVector())
         continue;
 
-      setOperationAction(ISD::AND,    SVT, Promote);
-      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
-      setOperationAction(ISD::OR,     SVT, Promote);
-      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
-      setOperationAction(ISD::XOR,    SVT, Promote);
-      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
-      setOperationAction(ISD::LOAD,   SVT, Promote);
-      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
-      setOperationAction(ISD::SELECT, SVT, Promote);
-      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
     }
 
     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -936,6 +946,18 @@
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
+
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
+    // As there is no 64-bit GPR available, we need to build a custom
+    // sequence to convert from v2i32 to v2f32.
+    if (!Subtarget->is64Bit())
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
+
+    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
   }
 
   if (Subtarget->hasSSE41()) {
@@ -950,6 +972,9 @@
     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
 
+    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
+
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 
@@ -1010,9 +1035,6 @@
     }
   }
 
-  if (Subtarget->hasSSE42())
-    setOperationAction(ISD::SETCC,             MVT::v2i64, Custom);
-
   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
@@ -1030,25 +1052,32 @@
     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
 
     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
+
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
 
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4f64,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i64,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f32,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i32,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i8,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i16, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
 
     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
@@ -1073,7 +1102,7 @@
     setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
     setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
 
-    if (Subtarget->hasFMA()) {
+    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
       setOperationAction(ISD::FMA,             MVT::v8f32, Custom);
       setOperationAction(ISD::FMA,             MVT::v4f64, Custom);
       setOperationAction(ISD::FMA,             MVT::v4f32, Custom);
@@ -1081,6 +1110,7 @@
       setOperationAction(ISD::FMA,             MVT::f32, Custom);
       setOperationAction(ISD::FMA,             MVT::f64, Custom);
     }
+
     if (Subtarget->hasAVX2()) {
       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
@@ -1134,45 +1164,44 @@
     // Custom lower several nodes for 256-bit types.
     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
-      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
-      EVT VT = SVT;
+      MVT VT = (MVT::SimpleValueType)i;
 
       // Extract subvector is special because the value type
       // (result) is 128-bit but the source is 256-bit wide.
       if (VT.is128BitVector())
-        setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 
       // Do not attempt to custom lower other non-256-bit vectors
       if (!VT.is256BitVector())
         continue;
 
-      setOperationAction(ISD::BUILD_VECTOR,       SVT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     SVT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  SVT, Custom);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   SVT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   SVT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
     }
 
     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
-      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
-      EVT VT = SVT;
+      MVT VT = (MVT::SimpleValueType)i;
 
       // Do not attempt to promote non-256-bit vectors
       if (!VT.is256BitVector())
         continue;
 
-      setOperationAction(ISD::AND,    SVT, Promote);
-      AddPromotedToType (ISD::AND,    SVT, MVT::v4i64);
-      setOperationAction(ISD::OR,     SVT, Promote);
-      AddPromotedToType (ISD::OR,     SVT, MVT::v4i64);
-      setOperationAction(ISD::XOR,    SVT, Promote);
-      AddPromotedToType (ISD::XOR,    SVT, MVT::v4i64);
-      setOperationAction(ISD::LOAD,   SVT, Promote);
-      AddPromotedToType (ISD::LOAD,   SVT, MVT::v4i64);
-      setOperationAction(ISD::SELECT, SVT, Promote);
-      AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
     }
   }
 
@@ -1238,10 +1267,8 @@
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::TRUNCATE);
-  setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
-  setTargetDAGCombine(ISD::FP_TO_SINT);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::XOR);
@@ -1339,7 +1366,7 @@
   // cases like PR2962.  This should be removed when PR2962 is fixed.
   const Function *F = MF.getFunction();
   if (IsZeroVal &&
-      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
+      !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
     if (Size >= 16 &&
         (Subtarget->isUnalignedMemAccessFast() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
@@ -1902,9 +1929,9 @@
         RC = &X86::FR32RegClass;
       else if (RegVT == MVT::f64)
         RC = &X86::FR64RegClass;
-      else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
+      else if (RegVT.is256BitVector())
         RC = &X86::VR256RegClass;
-      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
+      else if (RegVT.is128BitVector())
         RC = &X86::VR128RegClass;
       else if (RegVT == MVT::x86mmx)
         RC = &X86::VR64RegClass;
@@ -2007,7 +2034,8 @@
       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                        TotalNumIntRegs);
 
-      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
+      bool NoImplicitFloatOps = Fn->getFnAttributes().
+        hasAttribute(Attributes::NoImplicitFloat);
       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
              "SSE register cannot be used when SSE is disabled!");
       assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
@@ -2155,16 +2183,14 @@
 /// optimization is performed and it is required (FPDiff!=0).
 static SDValue
 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
-                         SDValue Chain, SDValue RetAddrFrIdx,
-                         bool Is64Bit, int FPDiff, DebugLoc dl) {
+                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
+                         unsigned SlotSize, int FPDiff, DebugLoc dl) {
   // Store the return address to the appropriate stack slot.
   if (!FPDiff) return Chain;
   // Calculate the new stack slot for the return address.
-  int SlotSize = Is64Bit ? 8 : 4;
   int NewReturnAddrFI =
     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
-  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
-  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
                        false, false, 0);
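With the pointer type and slot size now passed in explicitly rather than
derived from Is64Bit, the new return-address slot is simply created at
FPDiff - SlotSize. A minimal standalone sketch of that offset computation
(plain C++, not the LLVM API; names are illustrative):

    #include <cassert>
    #include <cstdio>

    // With the return address occupying one SlotSize-byte slot, a tail call
    // that adjusts the frame by FPDiff bytes must re-store the return
    // address at FPDiff - SlotSize relative to the incoming slot.
    static int newRetAddrOffset(int FPDiff, unsigned SlotSize) {
      assert(FPDiff != 0 && "no store is needed when the frames match");
      return FPDiff - (int)SlotSize;
    }

    int main() {
      // A 64-bit target (8-byte slots) with an FPDiff of -16.
      std::printf("%d\n", newRetAddrOffset(-16, 8)); // -24
    }
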
@@ -2199,7 +2225,7 @@
     // Check if it's really possible to do a tail call.
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                     isVarArg, SR != NotStructReturn,
-                    MF.getFunction()->hasStructRetAttr(),
+                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
                     Outs, OutVals, Ins, DAG);
 
     // Sibcalls are automatically detected tailcalls which do not require
@@ -2239,14 +2265,15 @@
   int FPDiff = 0;
   if (isTailCall && !IsSibcall) {
     // Lower arguments at fp - stackoffset + fpdiff.
-    unsigned NumBytesCallerPushed =
-      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
     FPDiff = NumBytesCallerPushed - NumBytes;
 
     // Set the delta of movement of the returnaddr stackslot.
     // But only set if delta is greater than previous delta.
-    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
-      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+    if (FPDiff < X86Info->getTCReturnAddrDelta())
+      X86Info->setTCReturnAddrDelta(FPDiff);
   }
 
   if (!IsSibcall)
@@ -2282,7 +2309,7 @@
       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
       break;
     case CCValAssign::AExt:
-      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
+      if (RegVT.is128BitVector()) {
         // Special case: passing MMX values in XMM registers.
         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
@@ -2323,7 +2350,8 @@
     } else if (!IsSibcall && (!isTailCall || isByVal)) {
       assert(VA.isMemLoc());
       if (StackPtr.getNode() == 0)
-        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
+        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                                      getPointerTy());
       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                              dl, DAG, VA, Flags));
     }
@@ -2411,7 +2439,8 @@
           // Copy relative to framepointer.
           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
           if (StackPtr.getNode() == 0)
-            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+            StackPtr = DAG.getCopyFromReg(Chain, dl,
+                                          RegInfo->getStackRegister(),
                                           getPointerTy());
           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
 
@@ -2433,7 +2462,8 @@
                           &MemOpChains2[0], MemOpChains2.size());
 
     // Store the return address to the appropriate stack slot.
-    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
+    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+                                     getPointerTy(), RegInfo->getSlotSize(),
                                      FPDiff, dl);
   }
 
@@ -2483,7 +2513,8 @@
         OpFlags = X86II::MO_DARWIN_STUB;
       } else if (Subtarget->isPICStyleRIPRel() &&
                  isa<Function>(GV) &&
-                 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
+                 cast<Function>(GV)->getFnAttributes().
+                   hasAttribute(Attributes::NonLazyBind)) {
         // If the function is marked as non-lazy, generate an indirect call
         // which loads from the GOT directly. This avoids runtime overhead
         // at the cost of eager binding (and one extra byte of encoding).
@@ -2644,7 +2675,7 @@
   unsigned StackAlignment = TFI.getStackAlignment();
   uint64_t AlignMask = StackAlignment - 1;
   int64_t Offset = StackSize;
-  uint64_t SlotSize = TD->getPointerSize();
+  unsigned SlotSize = RegInfo->getSlotSize();
   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
     // Number smaller than 12 so just add the difference.
     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
@@ -2719,6 +2750,7 @@
                                                      bool isVarArg,
                                                      bool isCalleeStructRet,
                                                      bool isCallerStructRet,
+                                                     Type *RetTy,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -2730,6 +2762,13 @@
   // If -tailcallopt is specified, make fastcc functions tail-callable.
   const MachineFunction &MF = DAG.getMachineFunction();
   const Function *CallerF = DAG.getMachineFunction().getFunction();
+
+  // If the function return type is x86_fp80 and the callee return type is not,
+  // then the FP_EXTEND of the call result is not a nop. It's not safe to
+  // perform a tailcall optimization here.
+  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+    return false;
+
   CallingConv::ID CallerCC = CallerF->getCallingConv();
   bool CCMatch = CallerCC == CalleeCC;
 
@@ -2853,7 +2892,7 @@
       MachineFrameInfo *MFI = MF.getFrameInfo();
       const MachineRegisterInfo *MRI = &MF.getRegInfo();
       const X86InstrInfo *TII =
-        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+        ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
         CCValAssign &VA = ArgLocs[i];
         SDValue Arg = OutVals[i];
@@ -3004,7 +3043,7 @@
 
   if (ReturnAddrIndex == 0) {
     // Set up a frame object for the return address.
-    uint64_t SlotSize = TD->getPointerSize();
+    unsigned SlotSize = RegInfo->getSlotSize();
     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                            false);
     FuncInfo->setRAIndex(ReturnAddrIndex);
@@ -3425,11 +3464,11 @@
 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
 static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
     return false;
 
+  unsigned NumElems = VT.getVectorNumElements();
+
   if (NumElems != 4)
     return false;
 
@@ -3444,11 +3483,11 @@
 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
 /// <2, 3, 2, 3>
 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
     return false;
 
+  unsigned NumElems = VT.getVectorNumElements();
+
   if (NumElems != 4)
     return false;
 
@@ -3461,7 +3500,7 @@
 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
 static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
     return false;
 
   unsigned NumElems = VT.getVectorNumElements();
@@ -3483,10 +3522,12 @@
 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
 static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
   unsigned NumElems = VT.getVectorNumElements();
 
-  if ((NumElems != 2 && NumElems != 4)
-      || VT.getSizeInBits() > 128)
+  if (NumElems != 2 && NumElems != 4)
     return false;
 
   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
@@ -3525,25 +3566,26 @@
     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
       MatchOddMask = false;
   }
-  static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
-  static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
 
-  const int *CompactionMask;
-  if (MatchEvenMask)
-    CompactionMask = CompactionMaskEven;
-  else if (MatchOddMask)
-    CompactionMask = CompactionMaskOdd;
-  else
+  if (!MatchEvenMask && !MatchOddMask)
     return SDValue();
 
   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
 
-  SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
-                                     UndefNode, CompactionMask);
-  SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
-                                     UndefNode, CompactionMask);
-  static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
-  return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+  SDValue Op0 = SVOp->getOperand(0);
+  SDValue Op1 = SVOp->getOperand(1);
+
+  if (MatchEvenMask) {
+    // Shift the second operand right to 32 bits.
+    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
+    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
+  } else {
+    // Shift the first operand left to 32 bits.
+    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
+    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
+  }
+  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
+  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
 }
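The rewritten sequence above replaces the two compaction shuffles plus an
unpack with a single per-operand shift and one blend. A standalone model of
the even-mask case (plain C++, no LLVM types; -1 marks an undef lane) shows
it still produces the interleaved even elements {0,8,2,10,4,12,6,14}:

    #include <cstdio>

    int main() {
      int Op0[8] = {0, 1, 2, 3, 4, 5, 6, 7};       // lanes of vector one
      int Op1[8] = {8, 9, 10, 11, 12, 13, 14, 15}; // lanes of vector two
      static const int ShiftRightMask[8] = {-1, 0, -1, 2, -1, 4, -1, 6};
      static const int BlendMask[8]      = {0, 9, 2, 11, 4, 13, 6, 15};
      int Shifted[8], Out[8];
      // Shift the second operand's even lanes into the odd slots.
      for (int i = 0; i != 8; ++i)
        Shifted[i] = ShiftRightMask[i] < 0 ? -1 : Op1[ShiftRightMask[i]];
      // Blend: indices 0-7 read Op0, indices 8-15 read the shifted vector.
      for (int i = 0; i != 8; ++i)
        Out[i] = BlendMask[i] < 8 ? Op0[BlendMask[i]]
                                  : Shifted[BlendMask[i] - 8];
      for (int v : Out)
        std::printf("%d ", v); // 0 8 2 10 4 12 6 14
      std::printf("\n");
    }
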
 
 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -3703,7 +3745,7 @@
 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
   if (VT.getVectorElementType().getSizeInBits() < 32)
     return false;
-  if (VT.getSizeInBits() == 256)
+  if (!VT.is128BitVector())
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
@@ -3725,7 +3767,7 @@
 /// The first half comes from the second half of V1 and the second half from
 /// the second half of V2.
 static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
-  if (!HasAVX || VT.getSizeInBits() != 256)
+  if (!HasAVX || !VT.is256BitVector())
     return false;
 
   // The shuffle result is divided into half A and half B. In total the two
@@ -3817,9 +3859,10 @@
 /// element of vector 2 and the other elements to come from vector 1 in order.
 static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
                                bool V2IsSplat = false, bool V2IsUndef = false) {
-  unsigned NumOps = VT.getVectorNumElements();
-  if (VT.getSizeInBits() == 256)
+  if (!VT.is128BitVector())
     return false;
+
+  unsigned NumOps = VT.getVectorNumElements();
   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
     return false;
 
@@ -3885,9 +3928,11 @@
 /// specifies a shuffle of elements that is suitable for input to 256-bit
 /// version of MOVDDUP.
 static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
-  unsigned NumElts = VT.getVectorNumElements();
+  if (!HasAVX || !VT.is256BitVector())
+    return false;
 
-  if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts != 4)
     return false;
 
   for (unsigned i = 0; i != NumElts/2; ++i)
@@ -3903,7 +3948,7 @@
 /// specifies a shuffle of elements that is suitable for input to 128-bit
 /// version of MOVDDUP.
 static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
     return false;
 
   unsigned e = VT.getVectorNumElements() / 2;
@@ -4148,7 +4193,7 @@
 /// V1 (and in order), and the upper half elements should come from the upper
 /// half of V2 (and in order).
 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
     return false;
   if (VT.getVectorNumElements() != 4)
     return false;
@@ -4205,7 +4250,7 @@
 /// MOVLP, it must be either a vector load or a scalar load to vector.
 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                                ArrayRef<int> Mask, EVT VT) {
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
     return false;
 
   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
@@ -4591,7 +4636,6 @@
     MVT ShufVT = V.getValueType().getSimpleVT();
     unsigned NumElems = ShufVT.getVectorNumElements();
     SmallVector<int, 16> ShuffleMask;
-    SDValue ImmN;
     bool IsUnary;
 
     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
@@ -4747,7 +4791,7 @@
                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   // Although the logic below supports any bitwidth, there are no shift
   // instructions that handle more than 128-bit vectors.
-  if (SVOp->getValueType(0).getSizeInBits() > 128)
+  if (!SVOp->getValueType(0).is128BitVector())
     return false;
 
   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
@@ -4842,7 +4886,7 @@
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                          unsigned NumBits, SelectionDAG &DAG,
                          const TargetLowering &TLI, DebugLoc dl) {
-  assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
+  assert(VT.is128BitVector() && "Unknown type for VShift");
   EVT ShVT = MVT::v2i64;
   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
@@ -4993,6 +5037,18 @@
                                 LDBase->getAlignment(),
                                 false/*isVolatile*/, true/*ReadMem*/,
                                 false/*WriteMem*/);
+
+    // Make sure the newly-created LOAD is in the same position as LDBase in
+    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+    // update uses of LDBase's output chain to use the TokenFactor.
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(ResNode.getNode(), 1));
+    }
+
     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
   }
   return SDValue();
@@ -5006,7 +5062,7 @@
 /// The VBROADCAST node is returned when a pattern is found,
 /// or SDValue() otherwise.
 SDValue
-X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
+X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
   if (!Subtarget->hasAVX())
     return SDValue();
 
@@ -5075,7 +5131,7 @@
     }
   }
 
-  bool Is256 = VT.getSizeInBits() == 256;
+  bool Is256 = VT.is256BitVector();
 
   // Handle broadcasting a single constant scalar from the constant pool
   // into a vector. On Sandybridge it is still better to load a constant vector
@@ -5131,6 +5187,80 @@
 }
 
 SDValue
+X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // Skip if insert_vec_elt is not supported.
+  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+    return SDValue();
+
+  DebugLoc DL = Op.getDebugLoc();
+  unsigned NumElems = Op.getNumOperands();
+
+  SDValue VecIn1;
+  SDValue VecIn2;
+  SmallVector<unsigned, 4> InsertIndices;
+  SmallVector<int, 8> Mask(NumElems, -1);
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Opc = Op.getOperand(i).getOpcode();
+
+    if (Opc == ISD::UNDEF)
+      continue;
+
+    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Quit if more than 1 element needs inserting.
+      if (InsertIndices.size() > 1)
+        return SDValue();
+
+      InsertIndices.push_back(i);
+      continue;
+    }
+
+    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
+    // Quit if extracted from a vector of a different type.
+    if (ExtractedFromVec.getValueType() != VT)
+      return SDValue();
+
+    // Quit if non-constant index.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+
+    if (VecIn1.getNode() == 0)
+      VecIn1 = ExtractedFromVec;
+    else if (VecIn1 != ExtractedFromVec) {
+      if (VecIn2.getNode() == 0)
+        VecIn2 = ExtractedFromVec;
+      else if (VecIn2 != ExtractedFromVec)
+        // Quit if more than 2 vectors need shuffling.
+        return SDValue();
+    }
+
+    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+
+    if (ExtractedFromVec == VecIn1)
+      Mask[i] = Idx;
+    else if (ExtractedFromVec == VecIn2)
+      Mask[i] = Idx + NumElems;
+  }
+
+  if (VecIn1.getNode() == 0)
+    return SDValue();
+
+  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+    unsigned Idx = InsertIndices[i];
+    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+                     DAG.getIntPtrConstant(Idx));
+  }
+
+  return NV;
+}
+
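A hypothetical model (plain C++, not the LLVM API) of the pattern
buildFromShuffleMostly detects: each BUILD_VECTOR lane is either an extract
from a source vector, which becomes a shuffle-mask entry, or a loose scalar,
which is deferred to a trailing INSERT_VECTOR_ELT:

    #include <cstdio>
    #include <vector>

    struct Elt { int VecId; int Idx; }; // VecId < 0 means "loose scalar"

    int main() {
      // Lanes 0, 1 and 3 come from vector 0; lane 2 is a loose scalar.
      std::vector<Elt> Ops = {{0, 0}, {0, 1}, {-1, 0}, {0, 3}};
      std::vector<int> Mask;
      std::vector<unsigned> InsertIndices;
      for (unsigned i = 0; i != Ops.size(); ++i) {
        if (Ops[i].VecId < 0) {
          Mask.push_back(-1);          // leave the lane undef in the shuffle
          InsertIndices.push_back(i);  // fill it with an insert afterwards
        } else {
          Mask.push_back(Ops[i].Idx);  // a second source would add NumElems
        }
      }
      for (int m : Mask)
        std::printf("%d ", m);         // 0 1 -1 3
      std::printf("+ %zu insert(s)\n", InsertIndices.size());
    }
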
+SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
 
@@ -5237,12 +5367,12 @@
 
       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
-        if (VT.getSizeInBits() == 256) {
+        if (VT.is256BitVector()) {
           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                              Item, DAG.getIntPtrConstant(0));
         }
-        assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+        assert(VT.is128BitVector() && "Expected an SSE value type!");
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
@@ -5251,11 +5381,11 @@
       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
-        if (VT.getSizeInBits() == 256) {
+        if (VT.is256BitVector()) {
           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
         } else {
-          assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+          assert(VT.is128BitVector() && "Expected an SSE value type!");
           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
         }
         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
@@ -5315,7 +5445,7 @@
 
   // For AVX-length vectors, build the individual 128-bit pieces and use
   // shuffles to put them in place.
-  if (VT.getSizeInBits() == 256) {
+  if (VT.is256BitVector()) {
     SmallVector<SDValue, 32> V;
     for (unsigned i = 0; i != NumElems; ++i)
       V.push_back(Op.getOperand(i));
@@ -5396,7 +5526,7 @@
     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
   }
 
-  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
+  if (Values.size() > 1 && VT.is128BitVector()) {
     // Check for a build vector of consecutive loads.
     for (unsigned i = 0; i < NumElems; ++i)
       V[i] = Op.getOperand(i);
@@ -5406,6 +5536,11 @@
     if (LD.getNode())
       return LD;
 
+    // Check for a build vector mostly made of a shuffle plus a few inserts.
+    SDValue Sh = buildFromShuffleMostly(Op, DAG);
+    if (Sh.getNode())
+      return Sh;
+
     // For SSE 4.1, use insertps to put the high elements into the low element.
     if (getSubtarget()->hasSSE41()) {
       SDValue Result;
@@ -5457,39 +5592,13 @@
   return SDValue();
 }
 
-// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place
-// them in a MMX register.  This is better than doing a stack convert.
-static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
-  EVT ResVT = Op.getValueType();
-
-  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
-         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
-  int Mask[2];
-  SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
-  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
-  InVec = Op.getOperand(1);
-  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-    unsigned NumElts = ResVT.getVectorNumElements();
-    VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
-    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
-                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
-  } else {
-    InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
-    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
-    Mask[0] = 0; Mask[1] = 2;
-    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
-  }
-  return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
-}
-
 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
 // to create 256-bit vectors from two other 128-bit ones.
 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   EVT ResVT = Op.getValueType();
 
-  assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
+  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
 
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
@@ -5498,18 +5607,8 @@
   return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
 }
 
-SDValue
-X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
-  EVT ResVT = Op.getValueType();
-
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   assert(Op.getNumOperands() == 2);
-  assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
-         "Unsupported CONCAT_VECTORS for value type");
-
-  // We support concatenate two MMX registers and place them in a MMX register.
-  // This is better than doing a stack convert.
-  if (ResVT.is128BitVector())
-    return LowerMMXCONCAT_VECTORS(Op, DAG);
 
   // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
   // from two other 128-bit ones.
@@ -5517,9 +5616,9 @@
 }
 
 // Try to lower a shuffle node into a simple blend instruction.
-static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
-                                          const X86Subtarget *Subtarget,
-                                          SelectionDAG &DAG) {
+static SDValue
+LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   DebugLoc dl = SVOp->getDebugLoc();
@@ -5589,9 +5688,9 @@
 // 2. [ssse3] 1 x pshufb
 // 3. [ssse3] 2 x pshufb + 1 x por
 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
-SDValue
-X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
-                                            SelectionDAG &DAG) const {
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+                         SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
@@ -5848,8 +5947,6 @@
   DebugLoc dl = SVOp->getDebugLoc();
   ArrayRef<int> MaskVals = SVOp->getMask();
 
-  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-
   // If we have SSSE3, case 1 is generated when all result bytes come from
   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
   // present, fall back to case 3.
@@ -5873,7 +5970,11 @@
     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                      DAG.getNode(ISD::BUILD_VECTOR, dl,
                                  MVT::v16i8, &pshufbMask[0], 16));
-    if (V2IsUndef)
+
+    // As PSHUFB will zero elements with negative indices, it's safe to ignore
+    // the 2nd operand if it's undefined or zero.
+    if (V2.getOpcode() == ISD::UNDEF ||
+        ISD::isBuildVectorAllZeros(V2.getNode()))
       return V1;
 
     // Calculate the shuffle mask for the second input, shuffle it, and
@@ -5959,6 +6060,51 @@
   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
 }
 
+// v32i8 shuffles - Translate to VPSHUFB if possible.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+                                 const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  EVT VT = SVOp->getValueType(0);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  // VPSHUFB may be generated if
+  // (1) one of the input vectors is undefined or a zeroinitializer.
+  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+  // And (2) the mask indexes don't cross the 128-bit lane boundary.
+  if (VT != MVT::v32i8 || !Subtarget->hasAVX2() ||
+      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+    return SDValue();
+
+  if (V1IsAllZero && !V2IsAllZero) {
+    CommuteVectorShuffleMask(MaskVals, 32);
+    V1 = V2;
+  }
+  SmallVector<SDValue, 32> pshufbMask;
+  for (unsigned i = 0; i != 32; i++) {
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0 || EltIdx >= 32)
+      EltIdx = 0x80;
+    else {
+      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
+        // Crossing the 128-bit lane boundary is not allowed.
+        return SDValue();
+      EltIdx &= 0xf;
+    }
+    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+  }
+  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
+                      DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                  MVT::v32i8, &pshufbMask[0], 32));
+}
+
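A standalone model (plain C++, not the LLVM API) of the VPSHUFB mask
construction above: undef or out-of-range indices become 0x80, which makes
VPSHUFB write a zero byte; in-lane indices keep their low four bits; and any
cross-lane index defeats the transform, since VPSHUFB shuffles each 128-bit
lane independently:

    #include <cstdio>
    #include <vector>

    static bool buildPshufbMask(const std::vector<int> &MaskVals,
                                std::vector<unsigned char> &Out) {
      for (unsigned i = 0; i != 32; ++i) {
        int EltIdx = MaskVals[i];
        if (EltIdx < 0 || EltIdx >= 32) {
          Out.push_back(0x80); // writes a zero into this byte
          continue;
        }
        if ((EltIdx >= 16) != (i >= 16))
          return false; // would cross the 128-bit lane boundary
        Out.push_back((unsigned char)(EltIdx & 0xf));
      }
      return true;
    }

    int main() {
      std::vector<int> Mask(32, -1); // all-undef mask: every byte zeroed
      std::vector<unsigned char> Bytes;
      std::printf("%s, %zu mask bytes\n",
                  buildPshufbMask(Mask, Bytes) ? "ok" : "rejected",
                  Bytes.size());
    }
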
 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
 /// done when every pair / quad of shuffle mask elements point to elements in
@@ -6159,7 +6305,7 @@
   DebugLoc dl = SVOp->getDebugLoc();
   EVT VT = SVOp->getValueType(0);
 
-  assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+  assert(VT.is128BitVector() && "Unsupported vector size");
 
   std::pair<int, int> Locs[4];
   int Mask1[] = { -1, -1, -1, -1 };
@@ -6293,17 +6439,17 @@
 }
 
 static bool MayFoldVectorLoad(SDValue V) {
-  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
     V = V.getOperand(0);
+
   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
     V = V.getOperand(0);
   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
     // BUILD_VECTOR (load), undef
     V = V.getOperand(0);
-  if (MayFoldLoad(V))
-    return true;
-  return false;
+
+  return MayFoldLoad(V);
 }
 
 // FIXME: the version above should always be used. Since there's
@@ -6426,6 +6572,81 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+// Reduce a vector shuffle to zext.
+SDValue
+X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+  // PMOVZX is only available from SSE41.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  // Only AVX2 supports 256-bit vector integer extension.
+  if (!Subtarget->hasAVX2() && VT.is256BitVector())
+    return SDValue();
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Extension is a unary operation, and the element type of the source
+  // vector must be smaller than i64.
+  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+      VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
+  while ((1U << Shift) < NumElems) {
+    if (SVOp->getMaskElt(1U << Shift) == 1)
+      break;
+    Shift += 1;
+    // The maximal ratio is 8, i.e. from i8 to i64.
+    if (Shift > 3)
+      return SDValue();
+  }
+
+  // Check the shuffle mask.
+  unsigned Mask = (1U << Shift) - 1;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int EltIdx = SVOp->getMaskElt(i);
+    if ((i & Mask) != 0 && EltIdx != -1)
+      return SDValue();
+    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
+      return SDValue();
+  }
+
+  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
+  EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits);
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift);
+
+  if (!isTypeLegal(NVT))
+    return SDValue();
+
+  // Simplify the operand before it is fed into the shuffle.
+  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+  if (V1.getOpcode() == ISD::BITCAST &&
+      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V1.getOperand(0)
+        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+    ConstantSDNode *CIdx =
+      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
+    // If it's foldable, i.e. a normal load with a single use, we let code
+    // selection fold it. Otherwise, we shorten the conversion sequence.
+    if (CIdx && CIdx->getZExtValue() == 0 &&
+        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse()))
+      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+}
+
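A standalone sketch (plain C++, not the LLVM API) of the mask check above: a
shuffle reads as a zero-extension with ratio 1 << Shift exactly when source
element i sits in lane i << Shift and every lane in between is undef (-1):

    #include <cstdio>

    static bool isZExtMask(const int *Mask, unsigned NumElems,
                           unsigned Shift) {
      unsigned Low = (1U << Shift) - 1;
      for (unsigned i = 0; i != NumElems; ++i) {
        if ((i & Low) != 0 && Mask[i] != -1)
          return false; // in-between lanes must be undef
        if ((i & Low) == 0 && Mask[i] != (int)(i >> Shift))
          return false; // anchor lanes must hold consecutive elements
      }
      return true;
    }

    int main() {
      // v8i16 -> v4i32 (ratio 2), i.e. a PMOVZXWD-shaped mask.
      int Mask[8] = {0, -1, 1, -1, 2, -1, 3, -1};
      std::printf("%s\n", isZExtMask(Mask, 8, 1) ? "zext" : "not a zext");
    }
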
 SDValue
 X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
@@ -6456,6 +6677,11 @@
     return PromoteSplat(SVOp, DAG);
   }
 
+  // Check for integer-expanding shuffles.
+  SDValue NewOp = lowerVectorIntExtend(Op, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
   // If the shuffle can be profitably rewritten as a narrower shuffle, then
   // do it!
   if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
@@ -6505,7 +6731,8 @@
   bool HasAVX    = Subtarget->hasAVX();
   bool HasAVX2   = Subtarget->hasAVX2();
   MachineFunction &MF = DAG.getMachineFunction();
-  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+  bool OptForSize = MF.getFunction()->getFnAttributes().
+    hasAttribute(Attributes::OptimizeForSize);
 
   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
 
@@ -6774,7 +7001,7 @@
 
   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
   if (VT == MVT::v8i16) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
     if (NewOp.getNode())
       return NewOp;
   }
@@ -6785,9 +7012,15 @@
       return NewOp;
   }
 
+  if (VT == MVT::v32i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
   // Handle all 128-bit wide vectors with 4 elements, and match them with
   // several different shuffle types.
-  if (NumElems == 4 && VT.getSizeInBits() == 128)
+  if (NumElems == 4 && VT.is128BitVector())
     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
 
   // Handle general 256-bit shuffles
@@ -6803,14 +7036,14 @@
   EVT VT = Op.getValueType();
   DebugLoc dl = Op.getDebugLoc();
 
-  if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
+  if (!Op.getOperand(0).getValueType().is128BitVector())
     return SDValue();
 
   if (VT.getSizeInBits() == 8) {
     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
-                                    Op.getOperand(0), Op.getOperand(1));
+                                  Op.getOperand(0), Op.getOperand(1));
     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
-                                    DAG.getValueType(VT));
+                                  DAG.getValueType(VT));
     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   }
 
@@ -6825,9 +7058,9 @@
                                                  Op.getOperand(0)),
                                      Op.getOperand(1)));
     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
-                                    Op.getOperand(0), Op.getOperand(1));
+                                  Op.getOperand(0), Op.getOperand(1));
     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
-                                    DAG.getValueType(VT));
+                                  DAG.getValueType(VT));
     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   }
 
@@ -6873,7 +7106,7 @@
 
   // If this is a 256-bit vector result, first extract the 128-bit vector and
   // then extract the element from the 128-bit vector.
-  if (VecVT.getSizeInBits() == 256) {
+  if (VecVT.is256BitVector()) {
     DebugLoc dl = Op.getNode()->getDebugLoc();
     unsigned NumElems = VecVT.getVectorNumElements();
     SDValue Idx = Op.getOperand(1);
@@ -6888,7 +7121,7 @@
                        DAG.getConstant(IdxVal, MVT::i32));
   }
 
-  assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
+  assert(VecVT.is128BitVector() && "Unexpected vector length");
 
   if (Subtarget->hasSSE41()) {
     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
@@ -6911,9 +7144,9 @@
     // Transform it so it match pextrw which produces a 32-bit result.
     EVT EltVT = MVT::i32;
     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
-                                    Op.getOperand(0), Op.getOperand(1));
+                                  Op.getOperand(0), Op.getOperand(1));
     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
-                                    DAG.getValueType(VT));
+                                  DAG.getValueType(VT));
     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   }
 
@@ -6964,7 +7197,7 @@
   SDValue N1 = Op.getOperand(1);
   SDValue N2 = Op.getOperand(2);
 
-  if (VT.getSizeInBits() == 256)
+  if (!VT.is128BitVector())
     return SDValue();
 
   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
@@ -7020,7 +7253,7 @@
 
   // If this is a 256-bit vector result, first extract the 128-bit vector,
   // insert the element into the extracted half and then place it back.
-  if (VT.getSizeInBits() == 256) {
+  if (VT.is256BitVector()) {
     if (!isa<ConstantSDNode>(N2))
       return SDValue();
 
@@ -7056,15 +7289,14 @@
   return SDValue();
 }
 
-SDValue
-X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   LLVMContext *Context = DAG.getContext();
   DebugLoc dl = Op.getDebugLoc();
   EVT OpVT = Op.getValueType();
 
   // If this is a 256-bit vector result, first insert into a 128-bit
   // vector and then insert into the 256-bit vector.
-  if (OpVT.getSizeInBits() > 128) {
+  if (!OpVT.is128BitVector()) {
     // Insert into a 128-bit vector.
     EVT VT128 = EVT::getVectorVT(*Context,
                                  OpVT.getVectorElementType(),
@@ -7081,7 +7313,7 @@
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
 
   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
-  assert(OpVT.getSizeInBits() == 128 && "Expected an SSE type!");
+  assert(OpVT.is128BitVector() && "Expected an SSE type!");
   return DAG.getNode(ISD::BITCAST, dl, OpVT,
                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
 }
@@ -7089,15 +7321,15 @@
 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
 // a simple subregister reference or explicit instructions to grab
 // upper bits of a vector.
-SDValue
-X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
   if (Subtarget->hasAVX()) {
     DebugLoc dl = Op.getNode()->getDebugLoc();
     SDValue Vec = Op.getNode()->getOperand(0);
     SDValue Idx = Op.getNode()->getOperand(1);
 
-    if (Op.getNode()->getValueType(0).getSizeInBits() == 128 &&
-        Vec.getNode()->getValueType(0).getSizeInBits() == 256 &&
+    if (Op.getNode()->getValueType(0).is128BitVector() &&
+        Vec.getNode()->getValueType(0).is256BitVector() &&
         isa<ConstantSDNode>(Idx)) {
       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
       return Extract128BitVector(Vec, IdxVal, DAG, dl);
@@ -7109,16 +7341,16 @@
 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
 // simple superregister reference or explicit instructions to insert
 // the upper bits of a vector.
-SDValue
-X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
   if (Subtarget->hasAVX()) {
     DebugLoc dl = Op.getNode()->getDebugLoc();
     SDValue Vec = Op.getNode()->getOperand(0);
     SDValue SubVec = Op.getNode()->getOperand(1);
     SDValue Idx = Op.getNode()->getOperand(2);
 
-    if (Op.getNode()->getValueType(0).getSizeInBits() == 256 &&
-        SubVec.getNode()->getValueType(0).getSizeInBits() == 128 &&
+    if (Op.getNode()->getValueType(0).is256BitVector() &&
+        SubVec.getNode()->getValueType(0).is128BitVector() &&
         isa<ConstantSDNode>(Idx)) {
       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
@@ -7253,9 +7485,10 @@
     Subtarget->ClassifyBlockAddressReference();
   CodeModel::Model M = getTargetMachine().getCodeModel();
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   DebugLoc dl = Op.getDebugLoc();
-  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
-                                       /*isTarget=*/true, OpFlags);
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+                                             OpFlags);
 
   if (Subtarget->isPICStyleRIPRel() &&
       (M == CodeModel::Small || M == CodeModel::Kernel))
@@ -7364,8 +7597,8 @@
   SDValue InFlag;
   DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
-                                     DAG.getNode(X86ISD::GlobalBaseReg,
-                                                 DebugLoc(), PtrVT), InFlag);
+                                   DAG.getNode(X86ISD::GlobalBaseReg,
+                                               DebugLoc(), PtrVT), InFlag);
   InFlag = Chain.getValue(1);
 
   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
@@ -7866,11 +8099,29 @@
   return Sub;
 }
 
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  EVT SVT = N0.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
+          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
+         "Custom UINT_TO_FP is not supported!");
+
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                             SVT.getVectorNumElements());
+  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+}
+
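The rewrite above is sound because every v4i8/v4i16/v8i8/v8i16 element
zero-extended to 32 bits is a nonnegative int32, so the signed conversion
produces exactly the float the unsigned conversion would. A scalar
demonstration (plain C++):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t u = 0xFFFF;                 // largest u16 value
      int32_t wide = (int32_t)(uint32_t)u; // zero-extend: 65535, still >= 0
      std::printf("%f\n", (float)wide);    // 65535.000000, as uint->fp would
    }
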
 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDValue N0 = Op.getOperand(0);
   DebugLoc dl = Op.getDebugLoc();
 
+  if (Op.getValueType().isVector())
+    return lowerUINT_TO_FP_vec(Op, DAG);
+
   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
   // the optimization here.
@@ -8044,10 +8295,66 @@
   }
 }
 
+SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  SDValue In = Op.getOperand(0);
+  EVT SVT = In.getValueType();
+
+  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
+      VT.getVectorNumElements() != SVT.getVectorNumElements())
+    return SDValue();
+
+  assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!");
+
+  // AVX2 has better support of integer extending.
+  if (Subtarget->hasAVX2())
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
+  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
+  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
+                           DAG.getVectorShuffle(MVT::v8i16, DL, In,
+                                                DAG.getUNDEF(MVT::v8i16),
+                                                &Mask[0]));
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+}
+
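A standalone model (plain C++, not the LLVM API) of the pre-AVX2 split above
for v8i16 -> v8i32: zero-extend the low four lanes, shuffle the high four
down with {4,5,6,7,-1,-1,-1,-1}, zero-extend those, then concatenate:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t In[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      uint32_t Lo[4], Hi[4], Out[8];
      for (int i = 0; i != 4; ++i)
        Lo[i] = In[i];           // VZEXT of lanes 0..3
      for (int i = 0; i != 4; ++i)
        Hi[i] = In[4 + i];       // shuffle lanes 4..7 down, then VZEXT
      for (int i = 0; i != 4; ++i) {
        Out[i] = Lo[i];          // CONCAT_VECTORS of the two v4i32 halves
        Out[4 + i] = Hi[i];
      }
      for (uint32_t v : Out)
        std::printf("%u ", v);   // 1 2 3 4 5 6 7 8, now in 32-bit lanes
      std::printf("\n");
    }
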
+SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  EVT SVT = Op.getOperand(0).getValueType();
+
+  if (!VT.is128BitVector() || !SVT.is256BitVector() ||
+      VT.getVectorNumElements() != SVT.getVectorNumElements())
+    return SDValue();
+
+  assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                             NumElems * 2);
+
+  SDValue In = Op.getOperand(0);
+  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+  // Prepare the truncation shuffle mask.
+  for (unsigned i = 0; i != NumElems; ++i)
+    MaskVec[i] = i * 2;
+  SDValue V = DAG.getVectorShuffle(NVT, DL,
+                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
+                                   DAG.getUNDEF(NVT), &MaskVec[0]);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+                     DAG.getIntPtrConstant(0));
+}
+
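A standalone model of the truncating shuffle above (plain C++, assuming a
little-endian host so lane 2*i of the narrow view is the low half of wide
lane i): reinterpret v8i32 as v16i16 and keep only the even lanes:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      uint32_t In[8] = {0x10001, 0x20002, 0x30003, 0x40004,
                        0x50005, 0x60006, 0x70007, 0x80008};
      uint16_t Wide[16], Out[8];
      std::memcpy(Wide, In, sizeof In);   // the BITCAST: v8i32 -> v16i16
      for (int i = 0; i != 8; ++i)
        Out[i] = Wide[2 * i];             // mask {0,2,4,...,14}: even lanes
      for (unsigned v : Out)
        std::printf("%u ", v);            // 1 2 3 4 5 6 7 8: truncated
      std::printf("\n");
    }
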
 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                            SelectionDAG &DAG) const {
-  if (Op.getValueType().isVector())
+  if (Op.getValueType().isVector()) {
+    if (Op.getValueType() == MVT::v8i16)
+      return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(),
+                         DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+                                     MVT::v8i32, Op.getOperand(0)));
     return SDValue();
+  }
 
   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
     /*IsSigned=*/ true, /*IsReplace=*/ false);
@@ -8082,26 +8389,49 @@
   return FIST;
 }
 
-SDValue X86TargetLowering::LowerFABS(SDValue Op,
-                                     SelectionDAG &DAG) const {
+SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  SDValue In = Op.getOperand(0);
+  EVT SVT = In.getValueType();
+
+  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+                                 In, DAG.getUNDEF(SVT)));
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
   LLVMContext *Context = DAG.getContext();
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
   EVT EltVT = VT;
-  if (VT.isVector())
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
     EltVT = VT.getVectorElementType();
-  Constant *C;
-  if (EltVT == MVT::f64) {
-    C = ConstantVector::getSplat(2,
-                ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
-  } else {
-    C = ConstantVector::getSplat(4,
-               ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
+    NumElts = VT.getVectorNumElements();
   }
-  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
+  else
+    C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
+  C = ConstantVector::getSplat(NumElts, C);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
-                             false, false, false, 16);
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::AND, dl, ANDVT,
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+  }
   return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
 }
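The splatted constant built above clears only the sign bit, implementing
fabs as a bitwise AND. A scalar demonstration of the f64 case (plain C++):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      double x = -3.5;
      uint64_t bits, mask = ~(1ULL << 63); // the constant the lowering splats
      std::memcpy(&bits, &x, sizeof x);    // view the double as raw bits
      bits &= mask;                        // clear the sign bit only
      std::memcpy(&x, &bits, sizeof x);
      std::printf("%f\n", x);              // 3.500000
    }
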
 
@@ -8121,12 +8451,13 @@
   else
     C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
   C = ConstantVector::getSplat(NumElts, C);
-  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
-                             false, false, false, 16);
+                             false, false, false, Alignment);
   if (VT.isVector()) {
-    MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
+    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
     return DAG.getNode(ISD::BITCAST, dl, VT,
                        DAG.getNode(ISD::XOR, dl, XORVT,
                                    DAG.getNode(ISD::BITCAST, dl, XORVT,
@@ -8210,7 +8541,7 @@
   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
 }
 
-SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   SDValue N0 = Op.getOperand(0);
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
@@ -8221,6 +8552,98 @@
   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
 }
 
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+//
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  DebugLoc DL = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Opnds;
+  DenseMap<SDValue, unsigned> VecInMap;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is cast into a wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2; // 2 more nodes (LHS and RHS) are pushed.
+      continue;
+    }
+
+    // Quit if this is not an EXTRACT_VECTOR_ELT.
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if the index is not a constant.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    SDValue ExtractedFromVec = I->getOperand(0);
+    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not a 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+    }
+    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-/256-bit vector.");
+
+  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+  SmallVector<SDValue, 8> VecIns;
+
+  for (DenseMap<SDValue, unsigned>::const_iterator
+        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+    VecIns.push_back(I->first);
+  }
+
+  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+  // If more than one full vector is evaluated, OR them together before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is only
+    // 1 node left, i.e. the final OR'd value of all vectors.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
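A standalone sketch (plain C++, not the LLVM API) of the coverage check
above: every extracted lane of a source vector sets one bit in that vector's
mask, and the OR'd tree may be turned into a PTEST only if each source
vector's mask ends up full:

    #include <cstdio>

    int main() {
      unsigned NumElts = 4;                     // e.g. one v4i32 source
      unsigned FullMask = (1U << NumElts) - 1U; // 0b1111
      unsigned Seen = 0;
      int ExtractedLanes[] = {0, 1, 2, 3};      // lanes pulled out by the ORs
      for (int L : ExtractedLanes)
        Seen |= 1U << L;
      std::printf("%s\n", Seen == FullMask ? "PTEST-able" : "give up");
    }
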
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@@ -8254,7 +8677,33 @@
 
   unsigned Opcode = 0;
   unsigned NumOperands = 0;
-  switch (Op.getNode()->getOpcode()) {
+
+  // Truncate operations may prevent the merge of the SETCC instruction
+  // and the arithmetic instruction before it. Attempt to truncate the operands
+  // of the arithmetic instruction and use a reduced bit-width instruction.
+  bool NeedTruncation = false;
+  SDValue ArithOp = Op;
+  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+    SDValue Arith = Op->getOperand(0);
+    // Both the trunc and the arithmetic op need to have one user each.
+    if (Arith->hasOneUse())
+      switch (Arith.getOpcode()) {
+        default: break;
+        case ISD::ADD:
+        case ISD::SUB:
+        case ISD::AND:
+        case ISD::OR:
+        case ISD::XOR: {
+          NeedTruncation = true;
+          ArithOp = Arith;
+        }
+      }
+  }
+
+  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
+  // which may be the result of a CAST.  We use the variable 'Op', which is the
+  // non-casted value, when we check for possible users.
+  switch (ArithOp.getOpcode()) {
   case ISD::ADD:
     // Due to an isel shortcoming, be conservative if this add is likely to be
     // selected as part of a load-modify-store instruction. When the root node
@@ -8274,7 +8723,7 @@
         goto default_case;
 
     if (ConstantSDNode *C =
-        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
       // An add of one will be selected as an INC.
       if (C->getAPIntValue() == 1) {
         Opcode = X86ISD::INC;
@@ -8310,7 +8759,7 @@
 
       if (User->getOpcode() != ISD::BRCOND &&
           User->getOpcode() != ISD::SETCC &&
-          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
+          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
         NonFlagUse = true;
         break;
       }
@@ -8331,14 +8780,20 @@
         goto default_case;
 
     // Otherwise use a regular EFLAGS-setting instruction.
-    switch (Op.getNode()->getOpcode()) {
+    switch (ArithOp.getOpcode()) {
     default: llvm_unreachable("unexpected operator!");
-    case ISD::SUB:
-      Opcode = X86ISD::SUB;
-      break;
-    case ISD::OR:  Opcode = X86ISD::OR;  break;
+    case ISD::SUB: Opcode = X86ISD::SUB; break;
     case ISD::XOR: Opcode = X86ISD::XOR; break;
     case ISD::AND: Opcode = X86ISD::AND; break;
+    case ISD::OR: {
+      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+        if (EFLAGS.getNode())
+          return EFLAGS;
+      }
+      Opcode = X86ISD::OR;
+      break;
+    }
     }
 
     NumOperands = 2;
@@ -8356,19 +8811,40 @@
     break;
   }
 
+  // If we found that truncation is beneficial, perform the truncation and
+  // update 'Op'.
+  if (NeedTruncation) {
+    EVT VT = Op.getValueType();
+    SDValue WideVal = Op->getOperand(0);
+    EVT WideVT = WideVal.getValueType();
+    unsigned ConvertedOp = 0;
+    // Use a target machine opcode to prevent further DAGCombine
+    // optimizations that may separate the arithmetic operations
+    // from the setcc node.
+    switch (WideVal.getOpcode()) {
+      default: break;
+      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+      case ISD::AND: ConvertedOp = X86ISD::AND; break;
+      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
+      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+    }
+
+    if (ConvertedOp) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+      }
+    }
+  }
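
A hedged scalar illustration of why this narrowing is sound (the function is mine, not the patch's): for ADD/SUB/AND/OR/XOR the low bits of the wide result depend only on the low bits of the inputs, so the reduced bit-width instruction produces the same zero/non-zero outcome for the flag test.

    #include <cassert>
    #include <cstdint>

    // Truncate-then-operate equals operate-then-truncate for these ops.
    void narrowingIsSound(uint32_t x, uint32_t y) {
      assert((uint8_t)(x + y) == (uint8_t)((uint8_t)x + (uint8_t)y));
      assert((uint8_t)(x ^ y) == (uint8_t)((uint8_t)x ^ (uint8_t)y));
    }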
+
   if (Opcode == 0)
     // Emit a CMP with 0, which is the TEST pattern.
     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                        DAG.getConstant(0, Op.getValueType()));
 
-  if (Opcode == X86ISD::CMP) {
-    SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0),
-                              Op.getOperand(1));
-    // We can't replace usage of SUB with CMP.
-    // The SUB node will be removed later because there is no use of it.
-    return SDValue(New.getNode(), 0);
-  }
-
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   SmallVector<SDValue, 4> Ops;
   for (unsigned i = 0; i != NumOperands; ++i)
@@ -8554,7 +9030,7 @@
 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
 
-  assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
+  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
          "Unsupported value type for operation");
 
   unsigned NumElems = VT.getVectorNumElements();
@@ -8591,10 +9067,12 @@
   DebugLoc dl = Op.getDebugLoc();
 
   if (isFP) {
-    unsigned SSECC = 8;
+#ifndef NDEBUG
     EVT EltVT = Op0.getValueType().getVectorElementType();
-    assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
 
+    unsigned SSECC;
     bool Swap = false;
 
     // SSE Condition code mapping:
@@ -8607,7 +9085,7 @@
     //  6 - NLE
     //  7 - ORD
     switch (SetCCOpcode) {
-    default: break;
+    default: llvm_unreachable("Unexpected SETCC condition");
     case ISD::SETOEQ:
     case ISD::SETEQ:  SSECC = 0; break;
     case ISD::SETOGT:
@@ -8621,34 +9099,33 @@
     case ISD::SETUO:  SSECC = 3; break;
     case ISD::SETUNE:
     case ISD::SETNE:  SSECC = 4; break;
-    case ISD::SETULE: Swap = true;
+    case ISD::SETULE: Swap = true; // Fallthrough
     case ISD::SETUGE: SSECC = 5; break;
-    case ISD::SETULT: Swap = true;
+    case ISD::SETULT: Swap = true; // Fallthrough
     case ISD::SETUGT: SSECC = 6; break;
     case ISD::SETO:   SSECC = 7; break;
+    case ISD::SETUEQ:
+    case ISD::SETONE: SSECC = 8; break;
     }
     if (Swap)
       std::swap(Op0, Op1);
 
     // In the two special cases we can't handle, emit two comparisons.
     if (SSECC == 8) {
+      unsigned CC0, CC1;
+      unsigned CombineOpc;
       if (SetCCOpcode == ISD::SETUEQ) {
-        SDValue UNORD, EQ;
-        UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
-                            DAG.getConstant(3, MVT::i8));
-        EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
-                         DAG.getConstant(0, MVT::i8));
-        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
-      }
-      if (SetCCOpcode == ISD::SETONE) {
-        SDValue ORD, NEQ;
-        ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
-                          DAG.getConstant(7, MVT::i8));
-        NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
-                          DAG.getConstant(4, MVT::i8));
-        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
+        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+      } else {
+        assert(SetCCOpcode == ISD::SETONE);
+        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
       }
-      llvm_unreachable("Illegal FP comparison");
+
+      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC0, MVT::i8));
+      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC1, MVT::i8));
+      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
     }
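
The two CMPP pairs built above match the scalar definitions below (a hedged reference, not code from the patch): "unordered or equal" is UNORD OR EQ, and "ordered and not equal" is ORD AND NEQ.

    #include <cmath>

    bool setueq(double a, double b) {          // CC 3 (UNORD) OR CC 0 (EQ)
      return std::isnan(a) || std::isnan(b) || a == b;
    }
    bool setone(double a, double b) {          // CC 7 (ORD) AND CC 4 (NEQ)
      return !std::isnan(a) && !std::isnan(b) && a != b;
    }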
     // Handle all other FP comparisons here.
     return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
@@ -8656,17 +9133,17 @@
   }
 
   // Break 256-bit integer vector compare into smaller ones.
-  if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+  if (VT.is256BitVector() && !Subtarget->hasAVX2())
     return Lower256IntVSETCC(Op, DAG);
 
   // We are handling one of the integer comparisons here.  Since SSE only has
   // GT and EQ comparisons for integer, swapping operands and multiple
   // operations may be required for some comparisons.
-  unsigned Opc = 0;
+  unsigned Opc;
   bool Swap = false, Invert = false, FlipSigns = false;
 
   switch (SetCCOpcode) {
-  default: break;
+  default: llvm_unreachable("Unexpected SETCC condition");
   case ISD::SETNE:  Invert = true;
   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
   case ISD::SETLT:  Swap = true;
@@ -8683,10 +9160,12 @@
 
   // Check that the operation in question is available (most are plain SSE2,
   // but PCMPGTQ and PCMPEQQ have different requirements).
-  if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
-    return SDValue();
-  if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
-    return SDValue();
+  if (VT == MVT::v2i64) {
+    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
+      return SDValue();
+    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
+      return SDValue();
+  }
 
   // Since SSE has no unsigned integer comparisons, we need to flip the sign
   // bits of the inputs before performing those operations.
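
A hedged scalar sketch of that sign-bit flip (the helper is mine): XOR-ing both inputs with the sign bit turns the unsigned comparison into the signed one SSE does provide.

    #include <cstdint>

    // a >u b holds exactly when (a ^ 0x80000000) >s (b ^ 0x80000000);
    // two's-complement representation is assumed.
    bool ugtViaSignedGt(uint32_t a, uint32_t b) {
      int32_t sa = (int32_t)(a ^ 0x80000000u);
      int32_t sb = (int32_t)(b ^ 0x80000000u);
      return sa > sb;
    }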
@@ -8924,6 +9403,21 @@
     }
   }
 
+  // X86 doesn't have an i8 cmov. If both operands are the result of a
+  // truncate, widen the cmov and push the truncate through. This avoids
+  // introducing a new branch during isel and doesn't add any extensions.
+  if (Op.getValueType() == MVT::i8 &&
+      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+    if (T1.getValueType() == T2.getValueType() &&
+        // Blacklist CopyFromReg to avoid partial register stalls.
+        T1.getOpcode() != ISD::CopyFromReg &&
+        T2.getOpcode() != ISD::CopyFromReg) {
+      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+    }
+  }
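
Illustratively (a sketch under my own naming, not the patch's code), the rewrite performs one wide select and a single truncate instead of an i8 cmov:

    #include <cstdint>

    // (i8 (trunc a)) select (i8 (trunc b)) -> i8 (trunc (a select b)):
    // the select happens at the wide type, and one truncate remains.
    uint8_t widenedSelect(bool cond, uint32_t a, uint32_t b) {
      uint32_t wide = cond ? a : b; // wide CMOV
      return (uint8_t)wide;         // truncate pushed through
    }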
+
   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   // condition is true.
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
@@ -9278,7 +9772,8 @@
     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
     Flag = Chain.getValue(1);
 
-    Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+    Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                               SPTy).getValue(1);
 
     SDValue Ops1[2] = { Chain.getValue(0), Chain };
     return DAG.getMergeValues(Ops1, 2, dl);
@@ -9361,7 +9856,7 @@
 
   EVT ArgVT = Op.getNode()->getValueType(0);
   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-  uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
+  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
   uint8_t ArgMode;
 
   // Decide which area this value should be read from.
@@ -9381,7 +9876,8 @@
     // Sanity Check: Make sure using fp_offset makes sense.
     assert(!getTargetMachine().Options.UseSoftFloat &&
            !(DAG.getMachineFunction()
-                .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
+                .getFunction()->getFnAttributes()
+                .hasAttribute(Attributes::NoImplicitFloat)) &&
            Subtarget->hasSSE1());
   }
 
@@ -9412,7 +9908,8 @@
                      false, false, false, 0);
 }
 
-SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
   SDValue Chain = Op.getOperand(0);
@@ -9461,8 +9958,7 @@
   SDValue ShOps[4];
   ShOps[0] = ShAmt;
   ShOps[1] = DAG.getConstant(0, MVT::i32);
-  ShOps[2] = DAG.getUNDEF(MVT::i32);
-  ShOps[3] = DAG.getUNDEF(MVT::i32);
+  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
   ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
 
   // The return type has to be a 128-bit type with the same element
@@ -9474,8 +9970,7 @@
   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
 }
 
-SDValue
-X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   switch (IntNo) {
@@ -9505,8 +10000,8 @@
   case Intrinsic::x86_sse2_ucomigt_sd:
   case Intrinsic::x86_sse2_ucomige_sd:
   case Intrinsic::x86_sse2_ucomineq_sd: {
-    unsigned Opc = 0;
-    ISD::CondCode CC = ISD::SETCC_INVALID;
+    unsigned Opc;
+    ISD::CondCode CC;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
     case Intrinsic::x86_sse_comieq_ss:
@@ -9580,55 +10075,102 @@
                                 DAG.getConstant(X86CC, MVT::i8), Cond);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
+
   // Arithmetic intrinsics.
   case Intrinsic::x86_sse2_pmulu_dq:
   case Intrinsic::x86_avx2_pmulu_dq:
     return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
+
+  // SSE3/AVX horizontal add/sub intrinsics
   case Intrinsic::x86_sse3_hadd_ps:
   case Intrinsic::x86_sse3_hadd_pd:
   case Intrinsic::x86_avx_hadd_ps_256:
   case Intrinsic::x86_avx_hadd_pd_256:
-    return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::x86_sse3_hsub_ps:
   case Intrinsic::x86_sse3_hsub_pd:
   case Intrinsic::x86_avx_hsub_ps_256:
   case Intrinsic::x86_avx_hsub_pd_256:
-    return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::x86_ssse3_phadd_w_128:
   case Intrinsic::x86_ssse3_phadd_d_128:
   case Intrinsic::x86_avx2_phadd_w:
   case Intrinsic::x86_avx2_phadd_d:
-    return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::x86_ssse3_phsub_w_128:
   case Intrinsic::x86_ssse3_phsub_d_128:
   case Intrinsic::x86_avx2_phsub_w:
-  case Intrinsic::x86_avx2_phsub_d:
-    return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(),
+  case Intrinsic::x86_avx2_phsub_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse3_hadd_ps:
+    case Intrinsic::x86_sse3_hadd_pd:
+    case Intrinsic::x86_avx_hadd_ps_256:
+    case Intrinsic::x86_avx_hadd_pd_256:
+      Opcode = X86ISD::FHADD;
+      break;
+    case Intrinsic::x86_sse3_hsub_ps:
+    case Intrinsic::x86_sse3_hsub_pd:
+    case Intrinsic::x86_avx_hsub_ps_256:
+    case Intrinsic::x86_avx_hsub_pd_256:
+      Opcode = X86ISD::FHSUB;
+      break;
+    case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_avx2_phadd_w:
+    case Intrinsic::x86_avx2_phadd_d:
+      Opcode = X86ISD::HADD;
+      break;
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_avx2_phsub_w:
+    case Intrinsic::x86_avx2_phsub_d:
+      Opcode = X86ISD::HSUB;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
+  }
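
For reference, a hedged scalar model of the 4-lane FHADD case mapped above (illustrative function; lane order follows the SSE3 haddps definition):

    // Adjacent pairs of the first operand fill the low lanes, adjacent
    // pairs of the second operand fill the high lanes.
    void haddpsRef(const float a[4], const float b[4], float r[4]) {
      r[0] = a[0] + a[1];
      r[1] = a[2] + a[3];
      r[2] = b[0] + b[1];
      r[3] = b[2] + b[3];
    }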
+
+  // AVX2 variable shift intrinsics
   case Intrinsic::x86_avx2_psllv_d:
   case Intrinsic::x86_avx2_psllv_q:
   case Intrinsic::x86_avx2_psllv_d_256:
   case Intrinsic::x86_avx2_psllv_q_256:
-    return DAG.getNode(ISD::SHL, dl, Op.getValueType(),
-                      Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::x86_avx2_psrlv_d:
   case Intrinsic::x86_avx2_psrlv_q:
   case Intrinsic::x86_avx2_psrlv_d_256:
   case Intrinsic::x86_avx2_psrlv_q_256:
-    return DAG.getNode(ISD::SRL, dl, Op.getValueType(),
-                      Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::x86_avx2_psrav_d:
-  case Intrinsic::x86_avx2_psrav_d_256:
-    return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
-                      Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_avx2_psrav_d_256: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_avx2_psllv_d:
+    case Intrinsic::x86_avx2_psllv_q:
+    case Intrinsic::x86_avx2_psllv_d_256:
+    case Intrinsic::x86_avx2_psllv_q_256:
+      Opcode = ISD::SHL;
+      break;
+    case Intrinsic::x86_avx2_psrlv_d:
+    case Intrinsic::x86_avx2_psrlv_q:
+    case Intrinsic::x86_avx2_psrlv_d_256:
+    case Intrinsic::x86_avx2_psrlv_q_256:
+      Opcode = ISD::SRL;
+      break;
+    case Intrinsic::x86_avx2_psrav_d:
+    case Intrinsic::x86_avx2_psrav_d_256:
+      Opcode = ISD::SRA;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
   case Intrinsic::x86_ssse3_pshuf_b_128:
   case Intrinsic::x86_avx2_pshuf_b:
     return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
+
   case Intrinsic::x86_ssse3_psign_b_128:
   case Intrinsic::x86_ssse3_psign_w_128:
   case Intrinsic::x86_ssse3_psign_d_128:
@@ -9637,15 +10179,18 @@
   case Intrinsic::x86_avx2_psign_d:
     return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
+
   case Intrinsic::x86_sse41_insertps:
     return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
   case Intrinsic::x86_avx_vperm2f128_ps_256:
   case Intrinsic::x86_avx_vperm2f128_pd_256:
   case Intrinsic::x86_avx_vperm2f128_si_256:
   case Intrinsic::x86_avx2_vperm2i128:
     return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
   case Intrinsic::x86_avx2_permd:
   case Intrinsic::x86_avx2_permps:
     // Operands intentionally swapped. Mask is last operand to intrinsic,
@@ -9675,7 +10220,7 @@
   case Intrinsic::x86_avx_vtestc_pd_256:
   case Intrinsic::x86_avx_vtestnzc_pd_256: {
     bool IsTestPacked = false;
-    unsigned X86CC = 0;
+    unsigned X86CC;
     switch (IntNo) {
     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
     case Intrinsic::x86_avx_vtestz_ps:
@@ -9726,100 +10271,93 @@
   case Intrinsic::x86_avx2_psll_w:
   case Intrinsic::x86_avx2_psll_d:
   case Intrinsic::x86_avx2_psll_q:
-    return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::x86_sse2_psrl_w:
   case Intrinsic::x86_sse2_psrl_d:
   case Intrinsic::x86_sse2_psrl_q:
   case Intrinsic::x86_avx2_psrl_w:
   case Intrinsic::x86_avx2_psrl_d:
   case Intrinsic::x86_avx2_psrl_q:
-    return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::x86_sse2_psra_w:
   case Intrinsic::x86_sse2_psra_d:
   case Intrinsic::x86_avx2_psra_w:
-  case Intrinsic::x86_avx2_psra_d:
-    return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(),
+  case Intrinsic::x86_avx2_psra_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_psll_w:
+    case Intrinsic::x86_sse2_psll_d:
+    case Intrinsic::x86_sse2_psll_q:
+    case Intrinsic::x86_avx2_psll_w:
+    case Intrinsic::x86_avx2_psll_d:
+    case Intrinsic::x86_avx2_psll_q:
+      Opcode = X86ISD::VSHL;
+      break;
+    case Intrinsic::x86_sse2_psrl_w:
+    case Intrinsic::x86_sse2_psrl_d:
+    case Intrinsic::x86_sse2_psrl_q:
+    case Intrinsic::x86_avx2_psrl_w:
+    case Intrinsic::x86_avx2_psrl_d:
+    case Intrinsic::x86_avx2_psrl_q:
+      Opcode = X86ISD::VSRL;
+      break;
+    case Intrinsic::x86_sse2_psra_w:
+    case Intrinsic::x86_sse2_psra_d:
+    case Intrinsic::x86_avx2_psra_w:
+    case Intrinsic::x86_avx2_psra_d:
+      Opcode = X86ISD::VSRA;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // SSE/AVX immediate shift intrinsics
   case Intrinsic::x86_sse2_pslli_w:
   case Intrinsic::x86_sse2_pslli_d:
   case Intrinsic::x86_sse2_pslli_q:
   case Intrinsic::x86_avx2_pslli_w:
   case Intrinsic::x86_avx2_pslli_d:
   case Intrinsic::x86_avx2_pslli_q:
-    return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(),
-                               Op.getOperand(1), Op.getOperand(2), DAG);
   case Intrinsic::x86_sse2_psrli_w:
   case Intrinsic::x86_sse2_psrli_d:
   case Intrinsic::x86_sse2_psrli_q:
   case Intrinsic::x86_avx2_psrli_w:
   case Intrinsic::x86_avx2_psrli_d:
   case Intrinsic::x86_avx2_psrli_q:
-    return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(),
-                               Op.getOperand(1), Op.getOperand(2), DAG);
   case Intrinsic::x86_sse2_psrai_w:
   case Intrinsic::x86_sse2_psrai_d:
   case Intrinsic::x86_avx2_psrai_w:
-  case Intrinsic::x86_avx2_psrai_d:
-    return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(),
-                               Op.getOperand(1), Op.getOperand(2), DAG);
-  // Fix vector shift instructions where the last operand is a non-immediate
-  // i32 value.
-  case Intrinsic::x86_mmx_pslli_w:
-  case Intrinsic::x86_mmx_pslli_d:
-  case Intrinsic::x86_mmx_pslli_q:
-  case Intrinsic::x86_mmx_psrli_w:
-  case Intrinsic::x86_mmx_psrli_d:
-  case Intrinsic::x86_mmx_psrli_q:
-  case Intrinsic::x86_mmx_psrai_w:
-  case Intrinsic::x86_mmx_psrai_d: {
-    SDValue ShAmt = Op.getOperand(2);
-    if (isa<ConstantSDNode>(ShAmt))
-      return SDValue();
-
-    unsigned NewIntNo = 0;
+  case Intrinsic::x86_avx2_psrai_d: {
+    unsigned Opcode;
     switch (IntNo) {
-    case Intrinsic::x86_mmx_pslli_w:
-      NewIntNo = Intrinsic::x86_mmx_psll_w;
-      break;
-    case Intrinsic::x86_mmx_pslli_d:
-      NewIntNo = Intrinsic::x86_mmx_psll_d;
-      break;
-    case Intrinsic::x86_mmx_pslli_q:
-      NewIntNo = Intrinsic::x86_mmx_psll_q;
-      break;
-    case Intrinsic::x86_mmx_psrli_w:
-      NewIntNo = Intrinsic::x86_mmx_psrl_w;
-      break;
-    case Intrinsic::x86_mmx_psrli_d:
-      NewIntNo = Intrinsic::x86_mmx_psrl_d;
-      break;
-    case Intrinsic::x86_mmx_psrli_q:
-      NewIntNo = Intrinsic::x86_mmx_psrl_q;
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_pslli_w:
+    case Intrinsic::x86_sse2_pslli_d:
+    case Intrinsic::x86_sse2_pslli_q:
+    case Intrinsic::x86_avx2_pslli_w:
+    case Intrinsic::x86_avx2_pslli_d:
+    case Intrinsic::x86_avx2_pslli_q:
+      Opcode = X86ISD::VSHLI;
       break;
-    case Intrinsic::x86_mmx_psrai_w:
-      NewIntNo = Intrinsic::x86_mmx_psra_w;
+    case Intrinsic::x86_sse2_psrli_w:
+    case Intrinsic::x86_sse2_psrli_d:
+    case Intrinsic::x86_sse2_psrli_q:
+    case Intrinsic::x86_avx2_psrli_w:
+    case Intrinsic::x86_avx2_psrli_d:
+    case Intrinsic::x86_avx2_psrli_q:
+      Opcode = X86ISD::VSRLI;
       break;
-    case Intrinsic::x86_mmx_psrai_d:
-      NewIntNo = Intrinsic::x86_mmx_psra_d;
+    case Intrinsic::x86_sse2_psrai_w:
+    case Intrinsic::x86_sse2_psrai_d:
+    case Intrinsic::x86_avx2_psrai_w:
+    case Intrinsic::x86_avx2_psrai_d:
+      Opcode = X86ISD::VSRAI;
       break;
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
     }
-
-    // The vector shift intrinsics with scalars uses 32b shift amounts but
-    // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
-    // to be zero.
-    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
-                         DAG.getConstant(0, MVT::i32));
-// FIXME this must be lowered to get rid of the invalid type.
-
-    EVT VT = Op.getValueType();
-    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(NewIntNo, MVT::i32),
-                       Op.getOperand(1), ShAmt);
+    return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
+                               Op.getOperand(1), Op.getOperand(2), DAG);
   }
+
   case Intrinsic::x86_sse42_pcmpistria128:
   case Intrinsic::x86_sse42_pcmpestria128:
   case Intrinsic::x86_sse42_pcmpistric128:
@@ -9884,6 +10422,7 @@
                                 SDValue(PCMP.getNode(), 1));
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
+
   case Intrinsic::x86_sse42_pcmpistri128:
   case Intrinsic::x86_sse42_pcmpestri128: {
     unsigned Opcode;
@@ -9897,11 +10436,78 @@
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
     return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
   }
+  case Intrinsic::x86_fma_vfmadd_ps:
+  case Intrinsic::x86_fma_vfmadd_pd:
+  case Intrinsic::x86_fma_vfmsub_ps:
+  case Intrinsic::x86_fma_vfmsub_pd:
+  case Intrinsic::x86_fma_vfnmadd_ps:
+  case Intrinsic::x86_fma_vfnmadd_pd:
+  case Intrinsic::x86_fma_vfnmsub_ps:
+  case Intrinsic::x86_fma_vfnmsub_pd:
+  case Intrinsic::x86_fma_vfmaddsub_ps:
+  case Intrinsic::x86_fma_vfmaddsub_pd:
+  case Intrinsic::x86_fma_vfmsubadd_ps:
+  case Intrinsic::x86_fma_vfmsubadd_pd:
+  case Intrinsic::x86_fma_vfmadd_ps_256:
+  case Intrinsic::x86_fma_vfmadd_pd_256:
+  case Intrinsic::x86_fma_vfmsub_ps_256:
+  case Intrinsic::x86_fma_vfmsub_pd_256:
+  case Intrinsic::x86_fma_vfnmadd_ps_256:
+  case Intrinsic::x86_fma_vfnmadd_pd_256:
+  case Intrinsic::x86_fma_vfnmsub_ps_256:
+  case Intrinsic::x86_fma_vfnmsub_pd_256:
+  case Intrinsic::x86_fma_vfmaddsub_ps_256:
+  case Intrinsic::x86_fma_vfmaddsub_pd_256:
+  case Intrinsic::x86_fma_vfmsubadd_ps_256:
+  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_fma_vfmadd_ps:
+    case Intrinsic::x86_fma_vfmadd_pd:
+    case Intrinsic::x86_fma_vfmadd_ps_256:
+    case Intrinsic::x86_fma_vfmadd_pd_256:
+      Opc = X86ISD::FMADD;
+      break;
+    case Intrinsic::x86_fma_vfmsub_ps:
+    case Intrinsic::x86_fma_vfmsub_pd:
+    case Intrinsic::x86_fma_vfmsub_ps_256:
+    case Intrinsic::x86_fma_vfmsub_pd_256:
+      Opc = X86ISD::FMSUB;
+      break;
+    case Intrinsic::x86_fma_vfnmadd_ps:
+    case Intrinsic::x86_fma_vfnmadd_pd:
+    case Intrinsic::x86_fma_vfnmadd_ps_256:
+    case Intrinsic::x86_fma_vfnmadd_pd_256:
+      Opc = X86ISD::FNMADD;
+      break;
+    case Intrinsic::x86_fma_vfnmsub_ps:
+    case Intrinsic::x86_fma_vfnmsub_pd:
+    case Intrinsic::x86_fma_vfnmsub_ps_256:
+    case Intrinsic::x86_fma_vfnmsub_pd_256:
+      Opc = X86ISD::FNMSUB;
+      break;
+    case Intrinsic::x86_fma_vfmaddsub_ps:
+    case Intrinsic::x86_fma_vfmaddsub_pd:
+    case Intrinsic::x86_fma_vfmaddsub_ps_256:
+    case Intrinsic::x86_fma_vfmaddsub_pd_256:
+      Opc = X86ISD::FMADDSUB;
+      break;
+    case Intrinsic::x86_fma_vfmsubadd_ps:
+    case Intrinsic::x86_fma_vfmsubadd_pd:
+    case Intrinsic::x86_fma_vfmsubadd_ps_256:
+    case Intrinsic::x86_fma_vfmsubadd_pd_256:
+      Opc = X86ISD::FMSUBADD;
+      break;
+    }
+
+    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
+  }
   }
 }
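
As a lane-wise reference for the FMADDSUB/FMSUBADD mapping above (a hedged sketch of my own; per the instruction definitions, vfmaddsub adds in odd lanes and subtracts in even lanes, and vfmsubadd is the mirror image):

    #include <cstddef>

    void fmaddsubRef(const float *a, const float *b, const float *c,
                     float *r, size_t n) {
      for (size_t i = 0; i != n; ++i)
        r[i] = (i & 1) ? a[i] * b[i] + c[i]   // odd lanes: multiply-add
                       : a[i] * b[i] - c[i];  // even lanes: multiply-subtract
    }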
 
-SDValue
-X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   switch (IntNo) {
@@ -9939,21 +10545,21 @@
 
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   DebugLoc dl = Op.getDebugLoc();
+  EVT PtrVT = getPointerTy();
 
   if (Depth > 0) {
     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
     SDValue Offset =
-      DAG.getConstant(TD->getPointerSize(),
-                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
-    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
-                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
+      DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, PtrVT,
                                    FrameAddr, Offset),
                        MachinePointerInfo(), false, false, false, 0);
   }
 
   // Just load the return address.
   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
-  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
 }
 
@@ -9975,7 +10581,7 @@
 
 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  return DAG.getIntPtrConstant(2*TD->getPointerSize());
+  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
 }
 
 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
@@ -9990,7 +10596,7 @@
   unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
 
   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
-                                  DAG.getIntPtrConstant(TD->getPointerSize()));
+                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
   StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
                        false, false, 0);
@@ -10001,8 +10607,22 @@
                      Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
 }
 
-SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
-                                                  SelectionDAG &DAG) const {
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
+                     DAG.getVTList(MVT::i32, MVT::Other),
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
   return Op.getOperand(0);
 }
 
@@ -10015,6 +10635,7 @@
   DebugLoc dl  = Op.getDebugLoc();
 
   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
 
   if (Subtarget->is64Bit()) {
     SDValue OutChains[6];
@@ -10023,8 +10644,8 @@
     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
 
-    const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
-    const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
+    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
 
     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
 
@@ -10097,7 +10718,7 @@
 
         for (FunctionType::param_iterator I = FTy->param_begin(),
              E = FTy->param_end(); I != E; ++I, ++Idx)
-          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
+          if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
             // FIXME: should only count parameters that are lowered to integers.
             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
 
@@ -10126,7 +10747,7 @@
 
     // This is storing the opcode for MOV32ri.
     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
-    const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
+    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
     OutChains[0] = DAG.getStore(Root, dl,
                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                 Trmp, MachinePointerInfo(TrmpAddr),
@@ -10225,7 +10846,7 @@
                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
 }
 
-SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   EVT OpVT = VT;
   unsigned NumBits = VT.getSizeInBits();
@@ -10259,8 +10880,7 @@
   return Op;
 }
 
-SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
-                                                SelectionDAG &DAG) const {
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   EVT OpVT = VT;
   unsigned NumBits = VT.getSizeInBits();
@@ -10285,7 +10905,7 @@
   return Op;
 }
 
-SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   unsigned NumBits = VT.getSizeInBits();
   DebugLoc dl = Op.getDebugLoc();
@@ -10310,7 +10930,7 @@
 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
 
-  assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+  assert(VT.is256BitVector() && VT.isInteger() &&
          "Unsupported value type for operation");
 
   unsigned NumElems = VT.getVectorNumElements();
@@ -10334,25 +10954,26 @@
                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
 }
 
-SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getValueType().getSizeInBits() == 256 &&
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getValueType().is256BitVector() &&
          Op.getValueType().isInteger() &&
          "Only handle AVX 256-bit vector integer operation");
   return Lower256IntArith(Op, DAG);
 }
 
-SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getValueType().getSizeInBits() == 256 &&
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getValueType().is256BitVector() &&
          Op.getValueType().isInteger() &&
          "Only handle AVX 256-bit vector integer operation");
   return Lower256IntArith(Op, DAG);
 }
 
-SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+                        SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
 
   // Decompose 256-bit ops into smaller 128-bit ops.
-  if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+  if (VT.is256BitVector() && !Subtarget->hasAVX2())
     return Lower256IntArith(Op, DAG);
 
   assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
@@ -10582,7 +11203,7 @@
   }
 
   // Decompose 256-bit shifts into smaller 128-bit shifts.
-  if (VT.getSizeInBits() == 256) {
+  if (VT.is256BitVector()) {
     unsigned NumElems = VT.getVectorNumElements();
     MVT EltVT = VT.getVectorElementType().getSimpleVT();
     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
@@ -10623,7 +11244,7 @@
   return SDValue();
 }
 
-SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
   // looks for this combo and may remove the "setcc" instruction if the "setcc"
@@ -10738,7 +11359,7 @@
         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
 
-        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
       }
       // fall through
     case MVT::v4i32:
@@ -10751,7 +11372,8 @@
 }
 
 
-SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
+static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
+                               SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
 
   // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
@@ -10796,8 +11418,8 @@
   return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 }
 
-SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
-                                             SelectionDAG &DAG) const {
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
@@ -10835,7 +11457,8 @@
 }
 
 
-SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG) {
   EVT T = Op.getValueType();
   DebugLoc DL = Op.getDebugLoc();
   unsigned Reg = 0;
@@ -10866,8 +11489,8 @@
   return cpOut;
 }
 
-SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
-                                                 SelectionDAG &DAG) const {
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
   assert(Subtarget->is64Bit() && "Result not type legalized?");
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue TheChain = Op.getOperand(0);
@@ -10885,8 +11508,7 @@
   return DAG.getMergeValues(Ops, 2, dl);
 }
 
-SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
-                                            SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   EVT SrcVT = Op.getOperand(0).getValueType();
   EVT DstVT = Op.getValueType();
   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
@@ -10906,7 +11528,7 @@
   return SDValue();
 }
 
-SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
   SDNode *Node = Op.getNode();
   DebugLoc dl = Node->getDebugLoc();
   EVT T = Node->getValueType(0);
@@ -10979,9 +11601,9 @@
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Should not custom lower this!");
   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
-  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
-  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op,DAG);
-  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
+  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op, Subtarget, DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
@@ -10989,8 +11611,8 @@
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
-  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);
-  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
@@ -11002,8 +11624,11 @@
   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
+  case ISD::TRUNCATE:           return lowerTRUNCATE(Op, DAG);
+  case ISD::ZERO_EXTEND:        return lowerZERO_EXTEND(Op, DAG);
   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
+  case ISD::FP_EXTEND:          return lowerFP_EXTEND(Op, DAG);
   case ISD::FABS:               return LowerFABS(Op, DAG);
   case ISD::FNEG:               return LowerFNEG(Op, DAG);
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
@@ -11014,7 +11639,7 @@
   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
-  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
+  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
@@ -11023,13 +11648,15 @@
                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
-  case ISD::MUL:                return LowerMUL(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   case ISD::SRA:
   case ISD::SRL:
   case ISD::SHL:                return LowerShift(Op, DAG);
@@ -11039,7 +11666,7 @@
   case ISD::USUBO:
   case ISD::SMULO:
   case ISD::UMULO:              return LowerXALUO(Op, DAG);
-  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
+  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
   case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
   case ISD::ADDC:
   case ISD::ADDE:
@@ -11071,9 +11698,9 @@
   Results.push_back(Swap.getValue(1));
 }
 
-void X86TargetLowering::
+static void
 ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
-                        SelectionDAG &DAG, unsigned NewOp) const {
+                        SelectionDAG &DAG, unsigned NewOp) {
   DebugLoc dl = Node->getDebugLoc();
   assert (Node->getValueType(0) == MVT::i64 &&
           "Only know how to expand i64 atomics");
@@ -11132,6 +11759,27 @@
     }
     return;
   }
+  case ISD::UINT_TO_FP: {
+    if (N->getOperand(0).getValueType() != MVT::v2i32 &&
+        N->getValueType(0) != MVT::v2f32)
+      return;
+    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
+                                 N->getOperand(0));
+    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                     MVT::f64);
+    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
+    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+    return;
+  }
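
The constant 0x4330000000000000 above is the IEEE-754 bit pattern of 2^52; a hedged scalar rendition of the same bias trick (the helper name is mine):

    #include <cstdint>
    #include <cstring>

    // OR-ing a 32-bit value into the mantissa of 2^52 yields the double
    // 2^52 + x exactly, so subtracting 2^52 recovers x as a double.
    double uint32ToDoubleViaBias(uint32_t x) {
      uint64_t bits = 0x4330000000000000ULL | x;
      double d;
      std::memcpy(&d, &bits, sizeof(d));
      return d - 4503599627370496.0; // 2^52
    }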
+  case ISD::FP_ROUND: {
+    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+    Results.push_back(V);
+    return;
+  }
   case ISD::READCYCLECOUNTER: {
     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue TheChain = N->getOperand(0);
@@ -11194,26 +11842,56 @@
     return;
   }
   case ISD::ATOMIC_LOAD_ADD:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_AND:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_NAND:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_OR:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_SUB:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_XOR:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
-    return;
-  case ISD::ATOMIC_SWAP:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_SWAP: {
+    unsigned Opc;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unexpected opcode");
+    case ISD::ATOMIC_LOAD_ADD:
+      Opc = X86ISD::ATOMADD64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      Opc = X86ISD::ATOMAND64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_NAND:
+      Opc = X86ISD::ATOMNAND64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_OR:
+      Opc = X86ISD::ATOMOR64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_SUB:
+      Opc = X86ISD::ATOMSUB64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      Opc = X86ISD::ATOMXOR64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_MAX:
+      Opc = X86ISD::ATOMMAX64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_MIN:
+      Opc = X86ISD::ATOMMIN64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_UMAX:
+      Opc = X86ISD::ATOMUMAX64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_UMIN:
+      Opc = X86ISD::ATOMUMIN64_DAG;
+      break;
+    case ISD::ATOMIC_SWAP:
+      Opc = X86ISD::ATOMSWAP64_DAG;
+      break;
+    }
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
     return;
+  }
   case ISD::ATOMIC_LOAD:
     ReplaceATOMIC_LOAD(N, Results, DAG);
   }
@@ -11273,11 +11951,15 @@
   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
   case X86ISD::FMAX:               return "X86ISD::FMAX";
   case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
+  case X86ISD::FMINC:              return "X86ISD::FMINC";
   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
   case X86ISD::FRCP:               return "X86ISD::FRCP";
   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
+  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
+  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
@@ -11291,7 +11973,12 @@
   case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
   case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
+  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
+  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
+  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
+  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   case X86ISD::VSHL:               return "X86ISD::VSHL";
@@ -11358,6 +12045,8 @@
   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
+  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
+  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
   }
 }
 
@@ -11493,7 +12182,7 @@
   // FIXME: This collection of masks seems suspect.
   if (NumElts == 2)
     return true;
-  if (NumElts == 4 && VT.getSizeInBits() == 128) {
+  if (NumElts == 4 && VT.is128BitVector()) {
     return (isMOVLMask(Mask, VT)  ||
             isCommutedMOVLMask(Mask, VT, true) ||
             isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
@@ -11506,430 +12195,724 @@
 //                           X86 Scheduler Hooks
 //===----------------------------------------------------------------------===//
 
-// private utility function
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
-                                                       MachineBasicBlock *MBB,
-                                                       unsigned regOpc,
-                                                       unsigned immOpc,
-                                                       unsigned LoadOpc,
-                                                       unsigned CXchgOpc,
-                                                       unsigned notOpc,
-                                                       unsigned EAXreg,
-                                                 const TargetRegisterClass *RC,
-                                                       bool Invert) const {
-  // For the atomic bitwise operator, we generate
-  //   thisMBB:
-  //   newMBB:
-  //     ld  t1 = [bitinstr.addr]
-  //     op  t2 = t1, [bitinstr.val]
-  //     not t3 = t2  (if Invert)
-  //     mov EAX = t1
-  //     lcs dest = [bitinstr.addr], t3  [EAX is implicit]
-  //     bz  newMBB
-  //     fallthrough -->nextMBB
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  MachineFunction::iterator MBBIter = MBB;
-  ++MBBIter;
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+                                     const TargetInstrInfo *TII) {
+  DebugLoc DL = MI->getDebugLoc();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  // For v = xbegin(), we generate
+  //
+  // thisMBB:
+  //  xbegin sinkMBB
+  //
+  // mainMBB:
+  //  eax = -1
+  //
+  // sinkMBB:
+  //  v = eax
 
-  /// First build the CFG
-  MachineFunction *F = MBB->getParent();
   MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(MBBIter, newMBB);
-  F->insert(MBBIter, nextMBB);
-
-  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
-  nextMBB->splice(nextMBB->begin(), thisMBB,
-                  llvm::next(MachineBasicBlock::iterator(bInstr)),
-                  thisMBB->end());
-  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
-  // Update thisMBB to fall through to newMBB
-  thisMBB->addSuccessor(newMBB);
-
-  // newMBB jumps to itself and fall through to nextMBB
-  newMBB->addSuccessor(nextMBB);
-  newMBB->addSuccessor(newMBB);
-
-  // Insert instructions into newMBB based on incoming instruction
-  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
-         "unexpected number of operands");
-  DebugLoc dl = bInstr->getDebugLoc();
-  MachineOperand& destOper = bInstr->getOperand(0);
-  MachineOperand* argOpers[2 + X86::AddrNumOperands];
-  int numArgs = bInstr->getNumOperands() - 1;
-  for (int i=0; i < numArgs; ++i)
-    argOpers[i] = &bInstr->getOperand(i+1);
-
-  // x86 address has 4 operands: base, index, scale, and displacement
-  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
-  int valArgIndx = lastAddrIndx + 1;
-
-  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
-  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
-  for (int i=0; i <= lastAddrIndx; ++i)
-    (*MIB).addOperand(*argOpers[i]);
-
-  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
-  assert((argOpers[valArgIndx]->isReg() ||
-          argOpers[valArgIndx]->isImm()) &&
-         "invalid operand");
-  if (argOpers[valArgIndx]->isReg())
-    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
-  else
-    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
-  MIB.addReg(t1);
-  (*MIB).addOperand(*argOpers[valArgIndx]);
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
 
-  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
-  if (Invert) {
-    MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
-  }
-  else
-    t3 = t2;
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  //  xbegin sinkMBB
+  //  # fallthrough to mainMBB
+  //  # on abort, jump to sinkMBB
+  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(sinkMBB);
+
+  // mainMBB:
+  //  EAX = -1
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  // EAX is live into the sinkMBB
+  sinkMBB->addLiveIn(X86::EAX);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::EAX);
 
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
-  MIB.addReg(t1);
+  MI->eraseFromParent();
+  return sinkMBB;
+}
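
For context, the blocks emitted here implement the same contract as the RTM intrinsics below (a hedged usage sketch; assumes a compiler and CPU with RTM support):

    #include <immintrin.h>

    // _xbegin() falls through with _XBEGIN_STARTED (all-ones, the EAX = -1
    // in mainMBB) when the transaction starts; on abort, execution resumes
    // at the xbegin target with the abort status in EAX.
    unsigned tryTransactional() {
      unsigned status = _xbegin();
      if (status == _XBEGIN_STARTED) {
        // ... transactional region ...
        _xend();
      }
      return status;
    }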
 
-  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
-  for (int i=0; i <= lastAddrIndx; ++i)
-    (*MIB).addOperand(*argOpers[i]);
-  MIB.addReg(t3);
-  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
-  (*MIB).setMemRefs(bInstr->memoperands_begin(),
-                    bInstr->memoperands_end());
+// Get CMPXCHG opcode for the specified data type.
+static unsigned getCmpXChgOpcode(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:  return X86::LCMPXCHG8;
+  case MVT::i16: return X86::LCMPXCHG16;
+  case MVT::i32: return X86::LCMPXCHG32;
+  case MVT::i64: return X86::LCMPXCHG64;
+  default:
+    break;
+  }
+  llvm_unreachable("Invalid operand size!");
+}
 
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
-  MIB.addReg(EAXreg);
+// Get LOAD opcode for the specified data type.
+static unsigned getLoadOpcode(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:  return X86::MOV8rm;
+  case MVT::i16: return X86::MOV16rm;
+  case MVT::i32: return X86::MOV32rm;
+  case MVT::i64: return X86::MOV64rm;
+  default:
+    break;
+  }
+  llvm_unreachable("Invalid operand size!");
+}
 
-  // insert branch
-  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+// Get the opcode of the non-atomic instruction corresponding to the
+// specified atomic instruction.
+static unsigned getNonAtomicOpcode(unsigned Opc) {
+  switch (Opc) {
+  case X86::ATOMAND8:  return X86::AND8rr;
+  case X86::ATOMAND16: return X86::AND16rr;
+  case X86::ATOMAND32: return X86::AND32rr;
+  case X86::ATOMAND64: return X86::AND64rr;
+  case X86::ATOMOR8:   return X86::OR8rr;
+  case X86::ATOMOR16:  return X86::OR16rr;
+  case X86::ATOMOR32:  return X86::OR32rr;
+  case X86::ATOMOR64:  return X86::OR64rr;
+  case X86::ATOMXOR8:  return X86::XOR8rr;
+  case X86::ATOMXOR16: return X86::XOR16rr;
+  case X86::ATOMXOR32: return X86::XOR32rr;
+  case X86::ATOMXOR64: return X86::XOR64rr;
+  }
+  llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get the opcode of the non-atomic instruction corresponding to the specified
+// atomic instruction, plus the extra opcode needed to complete the operation.
+static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
+                                               unsigned &ExtraOpc) {
+  switch (Opc) {
+  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
+  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
+  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
+  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
+  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
+  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
+  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
+  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
+  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
+  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
+  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
+  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
+  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
+  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
+  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
+  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
+  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
+  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
+  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
+  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
+  }
+  llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get the opcode of the non-atomic instruction corresponding to the specified
+// atomic instruction for a 64-bit data type on a 32-bit target.
+static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
+  switch (Opc) {
+  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
+  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
+  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
+  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
+  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
+  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
+  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
+  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
+  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
+  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
+  }
+  llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get the opcode of the non-atomic instruction corresponding to the specified
+// atomic instruction for a 64-bit data type on a 32-bit target, plus the
+// extra opcode needed to complete the operation.
+static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
+                                                   unsigned &HiOpc,
+                                                   unsigned &ExtraOpc) {
+  switch (Opc) {
+  case X86::ATOMNAND6432:
+    ExtraOpc = X86::NOT32r;
+    HiOpc = X86::AND32rr;
+    return X86::AND32rr;
+  }
+  llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
 
-  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
-  return nextMBB;
+// Get the pseudo CMOV opcode for the specified data type.
+static unsigned getPseudoCMOVOpc(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:  return X86::CMOV_GR8;
+  case MVT::i16: return X86::CMOV_GR16;
+  case MVT::i32: return X86::CMOV_GR32;
+  default:
+    break;
+  }
+  llvm_unreachable("Unknown CMOV opcode!");
 }
 
-// private utility function:  64 bit atomics on 32 bit host.
+// EmitAtomicLoadArith - emit the code sequence for a pseudo atomic
+// instruction. The pseudo is translated into a compare-exchange spin loop,
+// from
+//
+//    ...
+//    dst = atomic-fetch-op MI.addr, MI.val
+//    ...
+//
+// to
+//
+//    ...
+//    EAX = LOAD [MI.addr]
+// loop:
+//    t1 = OP MI.val, EAX
+//    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+//    JNE loop
+// sink:
+//    dst = EAX
+//    ...
 MachineBasicBlock *
-X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
-                                                       MachineBasicBlock *MBB,
-                                                       unsigned regOpcL,
-                                                       unsigned regOpcH,
-                                                       unsigned immOpcL,
-                                                       unsigned immOpcH,
-                                                       bool Invert) const {
-  // For the atomic bitwise operator, we generate
-  //   thisMBB (instructions are in pairs, except cmpxchg8b)
-  //     ld t1,t2 = [bitinstr.addr]
-  //   newMBB:
-  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
-  //     op  t5, t6 <- out1, out2, [bitinstr.val]
-  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
-  //     neg t7, t8 < t5, t6  (if Invert)
-  //     mov ECX, EBX <- t5, t6
-  //     mov EAX, EDX <- t1, t2
-  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
-  //     mov t3, t4 <- EAX, EDX
-  //     bz  newMBB
-  //     result in out1, out2
-  //     fallthrough -->nextMBB
-
-  const TargetRegisterClass *RC = &X86::GR32RegClass;
-  const unsigned LoadOpc = X86::MOV32rm;
-  const unsigned NotOpc = X86::NOT32r;
+X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  MachineFunction::iterator MBBIter = MBB;
-  ++MBBIter;
+  DebugLoc DL = MI->getDebugLoc();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 &&
+         "Unexpected number of operands");
+
+  assert(MI->hasOneMemOperand() &&
+         "Expected atomic-load-op to have one memoperand");
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg, SrcReg;
+  unsigned MemOpndSlot;
+
+  unsigned CurOp = 0;
+
+  DstReg = MI->getOperand(CurOp++).getReg();
+  MemOpndSlot = CurOp;
+  CurOp += X86::AddrNumOperands;
+  SrcReg = MI->getOperand(CurOp++).getReg();
+
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  MVT::SimpleValueType VT = *RC->vt_begin();
+  unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
+
+  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
+  unsigned LOADOpc = getLoadOpcode(VT);
+
+  // For the atomic load-arith operator, we generate
+  //
+  //  thisMBB:
+  //    EAX = LOAD [MI.addr]
+  //  mainMBB:
+  //    t1 = OP MI.val, EAX
+  //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+  //    JNE mainMBB
+  //  sinkMBB:
 
-  /// First build the CFG
-  MachineFunction *F = MBB->getParent();
   MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(MBBIter, newMBB);
-  F->insert(MBBIter, nextMBB);
-
-  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
-  nextMBB->splice(nextMBB->begin(), thisMBB,
-                  llvm::next(MachineBasicBlock::iterator(bInstr)),
-                  thisMBB->end());
-  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
-  // Update thisMBB to fall through to newMBB
-  thisMBB->addSuccessor(newMBB);
-
-  // newMBB jumps to itself and fall through to nextMBB
-  newMBB->addSuccessor(nextMBB);
-  newMBB->addSuccessor(newMBB);
-
-  DebugLoc dl = bInstr->getDebugLoc();
-  // Insert instructions into newMBB based on incoming instruction
-  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
-  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
-         "unexpected number of operands");
-  MachineOperand& dest1Oper = bInstr->getOperand(0);
-  MachineOperand& dest2Oper = bInstr->getOperand(1);
-  MachineOperand* argOpers[2 + X86::AddrNumOperands];
-  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
-    argOpers[i] = &bInstr->getOperand(i+2);
-
-    // We use some of the operands multiple times, so conservatively just
-    // clear any kill flags that might be present.
-    if (argOpers[i]->isReg() && argOpers[i]->isUse())
-      argOpers[i]->setIsKill(false);
-  }
-
-  // x86 address has 5 operands: base, index, scale, displacement, and segment.
-  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
-
-  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
-  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
-  for (int i=0; i <= lastAddrIndx; ++i)
-    (*MIB).addOperand(*argOpers[i]);
-  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
-  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
-  // add 4 to displacement.
-  for (int i=0; i <= lastAddrIndx-2; ++i)
-    (*MIB).addOperand(*argOpers[i]);
-  MachineOperand newOp3 = *(argOpers[3]);
-  if (newOp3.isImm())
-    newOp3.setImm(newOp3.getImm()+4);
-  else
-    newOp3.setOffset(newOp3.getOffset()+4);
-  (*MIB).addOperand(newOp3);
-  (*MIB).addOperand(*argOpers[lastAddrIndx]);
-
-  // t3/4 are defined later, at the bottom of the loop
-  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
-  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
-  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
-    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
-  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
-    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
-
-  // The subsequent operations should be using the destination registers of
-  // the PHI instructions.
-  t1 = dest1Oper.getReg();
-  t2 = dest2Oper.getReg();
-
-  int valArgIndx = lastAddrIndx + 1;
-  assert((argOpers[valArgIndx]->isReg() ||
-          argOpers[valArgIndx]->isImm()) &&
-         "invalid operand");
-  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
-  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
-  if (argOpers[valArgIndx]->isReg())
-    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
-  else
-    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
-  if (regOpcL != X86::MOV32rr)
-    MIB.addReg(t1);
-  (*MIB).addOperand(*argOpers[valArgIndx]);
-  assert(argOpers[valArgIndx + 1]->isReg() ==
-         argOpers[valArgIndx]->isReg());
-  assert(argOpers[valArgIndx + 1]->isImm() ==
-         argOpers[valArgIndx]->isImm());
-  if (argOpers[valArgIndx + 1]->isReg())
-    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
-  else
-    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
-  if (regOpcH != X86::MOV32rr)
-    MIB.addReg(t2);
-  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
-
-  unsigned t7, t8;
-  if (Invert) {
-    t7 = F->getRegInfo().createVirtualRegister(RC);
-    t8 = F->getRegInfo().createVirtualRegister(RC);
-    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
-    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
-  } else {
-    t7 = t5;
-    t8 = t6;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  thisMBB->addSuccessor(mainMBB);
+
+  // mainMBB:
+  MachineBasicBlock *origMainMBB = mainMBB;
+  mainMBB->addLiveIn(AccPhyReg);
+
+  // Copy AccPhyReg as it is used more than once.
+  unsigned AccReg = MRI.createVirtualRegister(RC);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
+    .addReg(AccPhyReg);
+
+  unsigned t1 = MRI.createVirtualRegister(RC);
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic-load-op opcode!");
+  case X86::ATOMAND8:
+  case X86::ATOMAND16:
+  case X86::ATOMAND32:
+  case X86::ATOMAND64:
+  case X86::ATOMOR8:
+  case X86::ATOMOR16:
+  case X86::ATOMOR32:
+  case X86::ATOMOR64:
+  case X86::ATOMXOR8:
+  case X86::ATOMXOR16:
+  case X86::ATOMXOR32:
+  case X86::ATOMXOR64: {
+    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
+    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
+      .addReg(AccReg);
+    break;
   }
+  case X86::ATOMNAND8:
+  case X86::ATOMNAND16:
+  case X86::ATOMNAND32:
+  case X86::ATOMNAND64: {
+    unsigned t2 = MRI.createVirtualRegister(RC);
+    unsigned NOTOpc;
+    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
+    BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
+      .addReg(AccReg);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
+    break;
+  }
+  case X86::ATOMMAX8:
+  case X86::ATOMMAX16:
+  case X86::ATOMMAX32:
+  case X86::ATOMMAX64:
+  case X86::ATOMMIN8:
+  case X86::ATOMMIN16:
+  case X86::ATOMMIN32:
+  case X86::ATOMMIN64:
+  case X86::ATOMUMAX8:
+  case X86::ATOMUMAX16:
+  case X86::ATOMUMAX32:
+  case X86::ATOMUMAX64:
+  case X86::ATOMUMIN8:
+  case X86::ATOMUMIN16:
+  case X86::ATOMUMIN32:
+  case X86::ATOMUMIN64: {
+    unsigned CMPOpc;
+    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
+
+    BuildMI(mainMBB, DL, TII->get(CMPOpc))
+      .addReg(SrcReg)
+      .addReg(AccReg);
+
+    if (Subtarget->hasCMov()) {
+      if (VT != MVT::i8) {
+        // Native support
+        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
+          .addReg(SrcReg)
+          .addReg(AccReg);
+      } else {
+        // Promote i8 to i32 to use CMOV32
+        const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32);
+        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
+        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
+        unsigned t2 = MRI.createVirtualRegister(RC32);
+
+        unsigned Undef = MRI.createVirtualRegister(RC32);
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
+
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
+          .addReg(Undef)
+          .addReg(SrcReg)
+          .addImm(X86::sub_8bit);
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
+          .addReg(Undef)
+          .addReg(AccReg)
+          .addImm(X86::sub_8bit);
+
+        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
+          .addReg(SrcReg32)
+          .addReg(AccReg32);
 
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
-  MIB.addReg(t1);
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
-  MIB.addReg(t2);
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1)
+          .addReg(t2, 0, X86::sub_8bit);
+      }
+    } else {
+      // No native CMOV: use a pseudo select and lower it afterwards.
+      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
+             "Invalid atomic-load-op transformation!");
+      unsigned SelOpc = getPseudoCMOVOpc(VT);
+      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
+      assert(CC != X86::COND_INVALID &&
+             "Invalid atomic-load-op transformation!");
+      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
+              .addReg(SrcReg).addReg(AccReg)
+              .addImm(CC);
+      mainMBB = EmitLoweredSelect(MIB, mainMBB);
+    }
+    break;
+  }
+  }
 
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
-  MIB.addReg(t7);
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
-  MIB.addReg(t8);
+  // Copy AccPhyReg back from virtual register.
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
+    .addReg(AccReg);
+
+  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  MIB.addReg(t1);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
 
-  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
-  for (int i=0; i <= lastAddrIndx; ++i)
-    (*MIB).addOperand(*argOpers[i]);
+  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
 
-  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
-  (*MIB).setMemRefs(bInstr->memoperands_begin(),
-                    bInstr->memoperands_end());
+  mainMBB->addSuccessor(origMainMBB);
+  mainMBB->addSuccessor(sinkMBB);
 
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
-  MIB.addReg(X86::EAX);
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
-  MIB.addReg(X86::EDX);
+  // sinkMBB:
+  sinkMBB->addLiveIn(AccPhyReg);
 
-  // insert branch
-  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), DstReg)
+    .addReg(AccPhyReg);
 
-  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
-  return nextMBB;
+  MI->eraseFromParent();
+  return sinkMBB;
 }
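
As a semantic cross-check, the loop built by this function behaves like the
following compare-exchange loop, sketched here with the GCC/Clang __sync
builtin for the ATOMAND32 case; AtomicFetchAnd32 and its parameter names are
illustrative, not part of the backend:

    #include <stdint.h>

    static uint32_t AtomicFetchAnd32(volatile uint32_t *Addr, uint32_t Val) {
      uint32_t Old = *Addr;                // thisMBB: EAX = LOAD [MI.addr]
      for (;;) {                           // mainMBB:
        uint32_t New = Old & Val;          //   t1 = AND Val, EAX
        uint32_t Seen =
          __sync_val_compare_and_swap(Addr, Old, New);  // LCMPXCHG
        if (Seen == Old)                   //   not taken: fall to sinkMBB
          return Old;                      // sinkMBB: dst = EAX
        Old = Seen;                        //   JNE mainMBB: retry with the
      }                                    //   value CMPXCHG left in EAX
    }
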
 
-// private utility function
+// EmitAtomicLoadArith6432 - emit the code sequence for a 64-bit pseudo atomic
+// instruction on a 32-bit target. The pseudo is translated into a
+// CMPXCHG8B-based compare-exchange spin loop, from
+//
+//    ...
+//    dst = atomic-fetch-op MI.addr, MI.val
+//    ...
+//
+// to
+//
+//    ...
+//    EAX = LOAD [MI.addr + 0]
+//    EDX = LOAD [MI.addr + 4]
+// loop:
+//    EBX = OP MI.val.lo, EAX
+//    ECX = OP MI.val.hi, EDX
+//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and
+//                EDX:EAX is implicitly defined]
+//    JNE loop
+// sink:
+//    dst = EDX:EAX
+//    ...
 MachineBasicBlock *
-X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
-                                                      MachineBasicBlock *MBB,
-                                                      unsigned cmovOpc) const {
-  // For the atomic min/max operator, we generate
-  //   thisMBB:
-  //   newMBB:
-  //     ld t1 = [min/max.addr]
-  //     mov t2 = [min/max.val]
-  //     cmp  t1, t2
-  //     cmov[cond] t2 = t1
-  //     mov EAX = t1
-  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
-  //     bz   newMBB
-  //     fallthrough -->nextMBB
-  //
+X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) const {
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  MachineFunction::iterator MBBIter = MBB;
-  ++MBBIter;
+  DebugLoc DL = MI->getDebugLoc();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
+         "Unexpected number of operands");
+
+  assert(MI->hasOneMemOperand() &&
+         "Expected atomic-load-op32 to have one memoperand");
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstLoReg, DstHiReg;
+  unsigned SrcLoReg, SrcHiReg;
+  unsigned MemOpndSlot;
+
+  unsigned CurOp = 0;
+
+  DstLoReg = MI->getOperand(CurOp++).getReg();
+  DstHiReg = MI->getOperand(CurOp++).getReg();
+  MemOpndSlot = CurOp;
+  CurOp += X86::AddrNumOperands;
+  SrcLoReg = MI->getOperand(CurOp++).getReg();
+  SrcHiReg = MI->getOperand(CurOp++).getReg();
+
+  const TargetRegisterClass *RC = &X86::GR32RegClass;
+  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
+
+  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
+  unsigned LOADOpc = X86::MOV32rm;
+
+  // For the atomic load-arith operator, we generate
+  //
+  //  thisMBB:
+  //    EAX = LOAD [MI.addr + 0]
+  //    EDX = LOAD [MI.addr + 4]
+  //  mainMBB:
+  //    EBX = OP MI.val.lo, EAX
+  //    ECX = OP MI.val.hi, EDX
+  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and
+  //                EDX:EAX is implicitly defined]
+  //    JNE mainMBB
+  //  sinkMBB:
 
-  /// First build the CFG
-  MachineFunction *F = MBB->getParent();
   MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(MBBIter, newMBB);
-  F->insert(MBBIter, nextMBB);
-
-  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
-  nextMBB->splice(nextMBB->begin(), thisMBB,
-                  llvm::next(MachineBasicBlock::iterator(mInstr)),
-                  thisMBB->end());
-  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
-  // Update thisMBB to fall through to newMBB
-  thisMBB->addSuccessor(newMBB);
-
-  // newMBB jumps to newMBB and fall through to nextMBB
-  newMBB->addSuccessor(nextMBB);
-  newMBB->addSuccessor(newMBB);
-
-  DebugLoc dl = mInstr->getDebugLoc();
-  // Insert instructions into newMBB based on incoming instruction
-  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
-         "unexpected number of operands");
-  MachineOperand& destOper = mInstr->getOperand(0);
-  MachineOperand* argOpers[2 + X86::AddrNumOperands];
-  int numArgs = mInstr->getNumOperands() - 1;
-  for (int i=0; i < numArgs; ++i)
-    argOpers[i] = &mInstr->getOperand(i+1);
-
-  // x86 address has 4 operands: base, index, scale, and displacement
-  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
-  int valArgIndx = lastAddrIndx + 1;
-
-  unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
-  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
-  for (int i=0; i <= lastAddrIndx; ++i)
-    (*MIB).addOperand(*argOpers[i]);
-
-  // We only support register and immediate values
-  assert((argOpers[valArgIndx]->isReg() ||
-          argOpers[valArgIndx]->isImm()) &&
-         "invalid operand");
-
-  unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
-  if (argOpers[valArgIndx]->isReg())
-    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
-  else
-    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
-  (*MIB).addOperand(*argOpers[valArgIndx]);
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
 
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
-  MIB.addReg(t1);
+  MachineInstrBuilder MIB;
 
-  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
-  MIB.addReg(t1);
-  MIB.addReg(t2);
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  // Lo
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Hi
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
+    else
+      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
 
-  // Generate movc
-  unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
-  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
-  MIB.addReg(t2);
-  MIB.addReg(t1);
+  thisMBB->addSuccessor(mainMBB);
+
+  // mainMBB:
+  MachineBasicBlock *origMainMBB = mainMBB;
+  mainMBB->addLiveIn(X86::EAX);
+  mainMBB->addLiveIn(X86::EDX);
+
+  // Copy EDX:EAX as they are used more than once.
+  unsigned LoReg = MRI.createVirtualRegister(RC);
+  unsigned HiReg = MRI.createVirtualRegister(RC);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
 
-  // Cmp and exchange if none has modified the memory location
-  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
-  for (int i=0; i <= lastAddrIndx; ++i)
-    (*MIB).addOperand(*argOpers[i]);
-  MIB.addReg(t3);
-  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
-  (*MIB).setMemRefs(mInstr->memoperands_begin(),
-                    mInstr->memoperands_end());
+  unsigned t1L = MRI.createVirtualRegister(RC);
+  unsigned t1H = MRI.createVirtualRegister(RC);
 
-  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
-  MIB.addReg(X86::EAX);
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
+  case X86::ATOMAND6432:
+  case X86::ATOMOR6432:
+  case X86::ATOMXOR6432:
+  case X86::ATOMADD6432:
+  case X86::ATOMSUB6432: {
+    unsigned HiOpc;
+    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg);
+    break;
+  }
+  case X86::ATOMNAND6432: {
+    unsigned HiOpc, NOTOpc;
+    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
+    unsigned t2L = MRI.createVirtualRegister(RC);
+    unsigned t2H = MRI.createVirtualRegister(RC);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
+    break;
+  }
+  case X86::ATOMMAX6432:
+  case X86::ATOMMIN6432:
+  case X86::ATOMUMAX6432:
+  case X86::ATOMUMIN6432: {
+    unsigned HiOpc;
+    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+    unsigned cL = MRI.createVirtualRegister(RC8);
+    unsigned cH = MRI.createVirtualRegister(RC8);
+    unsigned cL32 = MRI.createVirtualRegister(RC);
+    unsigned cH32 = MRI.createVirtualRegister(RC);
+    unsigned cc = MRI.createVirtualRegister(RC);
+    // cl := cmp src_lo, lo
+    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+      .addReg(SrcLoReg).addReg(LoReg);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
+    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
+    // ch := cmp src_hi, hi
+    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+      .addReg(SrcHiReg).addReg(HiReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
+    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
+    // cc := (src_hi == hi) ? cl : ch;
+    if (Subtarget->hasCMov()) {
+      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
+        .addReg(cH32).addReg(cL32);
+    } else {
+      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
+              .addReg(cH32).addReg(cL32)
+              .addImm(X86::COND_E);
+      mainMBB = EmitLoweredSelect(MIB, mainMBB);
+    }
+    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
+    if (Subtarget->hasCMov()) {
+      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L)
+        .addReg(SrcLoReg).addReg(LoReg);
+      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H)
+        .addReg(SrcHiReg).addReg(HiReg);
+    } else {
+      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L)
+              .addReg(SrcLoReg).addReg(LoReg)
+              .addImm(X86::COND_NE);
+      mainMBB = EmitLoweredSelect(MIB, mainMBB);
+      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H)
+              .addReg(SrcHiReg).addReg(HiReg)
+              .addImm(X86::COND_NE);
+      mainMBB = EmitLoweredSelect(MIB, mainMBB);
+    }
+    break;
+  }
+  case X86::ATOMSWAP6432: {
+    unsigned HiOpc;
+    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
+    break;
+  }
+  }
+
+  // Copy EDX:EAX back from HiReg:LoReg
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
+  // Copy ECX:EBX from t1H:t1L
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
+
+  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
+
+  mainMBB->addSuccessor(origMainMBB);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  sinkMBB->addLiveIn(X86::EAX);
+  sinkMBB->addLiveIn(X86::EDX);
 
-  // insert branch
-  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), DstLoReg)
+    .addReg(X86::EAX);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), DstHiReg)
+    .addReg(X86::EDX);
 
-  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
-  return nextMBB;
+  MI->eraseFromParent();
+  return sinkMBB;
 }
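
The same cross-check for the 64-bit-on-32-bit case: on a target with
CMPXCHG8B, a __sync compare-and-swap on a uint64_t compiles down to the LOCK
CMPXCHG8B this inserter emits (AtomicFetchAdd64 is an illustrative name, and
the single 64-bit read stands in for the two 32-bit loads above):

    #include <stdint.h>

    static uint64_t AtomicFetchAdd64(volatile uint64_t *Addr, uint64_t Val) {
      uint64_t Old = *Addr;          // thisMBB: EAX/EDX = LOAD [addr]/[addr+4]
      for (;;) {                     // mainMBB:
        uint64_t New = Old + Val;    //   EBX = ADD lo; ECX = ADC hi
        uint64_t Seen =
          __sync_val_compare_and_swap(Addr, Old, New);  // LCMPXCHG8B
        if (Seen == Old)             //   not taken: fall through to sinkMBB
          return Old;                // sinkMBB: dst = EDX:EAX
        Old = Seen;                  //   JNE mainMBB: retry
      }
    }
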
 
 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
 // or XMM0_V32I8 in AVX all of this code can be replaced with that
 // in the .td file.
-MachineBasicBlock *
-X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
-                            unsigned numArgs, bool memArg) const {
-  assert(Subtarget->hasSSE42() &&
-         "Target must have SSE4.2 or AVX features enabled");
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  unsigned Opc;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
+  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
+  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
+  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
+  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+  }
 
   DebugLoc dl = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  unsigned NumArgs = MI->getNumOperands();
+  for (unsigned i = 1; i < NumArgs; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
+    if (!(Op.isReg() && Op.isImplicit()))
+      MIB.addOperand(Op);
+  }
+  if (MI->hasOneMemOperand())
+    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  BuildMI(*BB, MI, dl,
+    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::XMM0);
+
+  MI->eraseFromParent();
+  return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
   unsigned Opc;
-  if (!Subtarget->hasAVX()) {
-    if (memArg)
-      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
-    else
-      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
-  } else {
-    if (memArg)
-      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
-    else
-      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
+  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
+  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
+  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
+  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
   }
 
+  DebugLoc dl = MI->getDebugLoc();
   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-  for (unsigned i = 0; i < numArgs; ++i) {
-    MachineOperand &Op = MI->getOperand(i+1);
+
+  unsigned NumArgs = MI->getNumOperands(); // operand 0 (the result) is skipped
+  for (unsigned i = 1; i < NumArgs; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
     if (!(Op.isReg() && Op.isImplicit()))
       MIB.addOperand(Op);
   }
+  if (MI->hasOneMemOperand())
+    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
   BuildMI(*BB, MI, dl,
     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
-    .addReg(X86::XMM0);
+    .addReg(X86::ECX);
 
   MI->eraseFromParent();
   return BB;
 }
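
These REG/MEM pseudos back the SSE4.2 string intrinsics; the hardware
instruction leaves its index result in ECX, which is why the expansion ends by
copying ECX into the destination register. A small usage sketch (FindAnyByte
is a hypothetical helper, not backend code):

    #include <nmmintrin.h>  // SSE4.2 string instructions

    // Index of the first byte of Haystack matching any byte of Needle,
    // or 16 if there is no match; this lowers to PCMPISTRI.
    static int FindAnyByte(__m128i Needle, __m128i Haystack) {
      return _mm_cmpistri(Needle, Haystack,
                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
    }
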
 
-MachineBasicBlock *
-X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+                                      const TargetInstrInfo *TII,
+                                      const X86Subtarget *Subtarget) {
   DebugLoc dl = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 
   // Address into RAX/EAX, other two args into ECX, EDX.
   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
@@ -12618,6 +13601,203 @@
 }
 
 MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg;
+  unsigned MemOpndSlot = 0;
+
+  unsigned CurOp = 0;
+
+  DstReg = MI->getOperand(CurOp++).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MemOpndSlot = CurOp;
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  buf[LabelOffset] = restoreMBB
+  //  SjLjSetup restoreMBB
+  //
+  // mainMBB:
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+  // restoreMBB:
+  //  v_restore = 1
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+  MF->push_back(restoreMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  unsigned PtrStoreOpc = 0;
+  unsigned LabelReg = 0;
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  Reloc::Model RM = getTargetMachine().getRelocationModel();
+  bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
+                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+
+  // Prepare IP either in reg or imm.
+  if (!UseImmLabel) {
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+    LabelReg = MRI.createVirtualRegister(PtrRC);
+    if (Subtarget->is64Bit()) {
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+              .addReg(X86::RIP)
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB)
+              .addReg(0);
+    } else {
+      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+              .addReg(XII->getGlobalBaseReg(MF))
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+              .addReg(0);
+    }
+  } else
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+  // Store IP
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+    else
+      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  }
+  if (!UseImmLabel)
+    MIB.addReg(LabelReg);
+  else
+    MIB.addMBB(restoreMBB);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Setup
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+          .addMBB(restoreMBB);
+  MIB.addRegMask(RegInfo->getNoPreservedMask());
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(restoreMBB);
+
+  // mainMBB:
+  //  EAX = 0
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(X86::PHI), DstReg)
+    .addReg(mainDstReg).addMBB(mainMBB)
+    .addReg(restoreDstReg).addMBB(restoreMBB);
+
+  // restoreMBB:
+  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+  restoreMBB->addSuccessor(sinkMBB);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
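
These pseudos implement the GCC-style __builtin_setjmp/__builtin_longjmp pair,
which uses a fixed five-word buffer. A hedged usage sketch; the slot comments
reflect the offsets used above and in the longjmp emitter that follows:

    #include <stdio.h>

    static void *Buf[5];  // slot 0: FP, slot 1: IP (restoreMBB), slot 2: SP

    static void Thrower(void) {
      __builtin_longjmp(Buf, 1);         // the builtin form requires value 1
    }

    int main(void) {
      if (__builtin_setjmp(Buf) == 0) {  // mainMBB path: v_main = 0
        Thrower();                       // transfers control to restoreMBB...
      } else {
        puts("restored");                // ...where v_restore = 1
      }
      return 0;
    }
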
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+                                     MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  const TargetRegisterClass *RC =
+    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+  unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but not otherwise referenced, it's treated
+  // as a plain GPR.
+  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+  unsigned SP = RegInfo->getStackRegister();
+
+  MachineInstrBuilder MIB;
+
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t SPOffset = 2 * PVT.getStoreSize();
+
+  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+
+  // Reload FP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(i));
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload IP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(i), LabelOffset);
+    else
+      MIB.addOperand(MI->getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload SP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(i), SPOffset);
+    else
+      MIB.addOperand(MI->getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Jump
+  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+  MI->eraseFromParent();
+  return MBB;
+}
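
Summarizing the three reloads above as a buffer-layout sketch (PVT-sized
slots; the struct and field names are illustrative only):

    // Offsets match the emitter: FP at 0, IP at 1 * PVT.getStoreSize(),
    // SP at 2 * PVT.getStoreSize(); the reload order is FP, IP, SP, then
    // the indirect jump through the reloaded IP.
    struct SjLjJumpBuf {
      void *FP;  // reloaded into EBP/RBP
      void *IP;  // address of restoreMBB, jumped through via Tmp
      void *SP;  // reloaded into the stack register before the jump
    };
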
+
+MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
   switch (MI->getOpcode()) {
@@ -12741,188 +13921,106 @@
     // String/text processing lowering.
   case X86::PCMPISTRM128REG:
   case X86::VPCMPISTRM128REG:
-    return EmitPCMP(MI, BB, 3, false /* in-mem */);
   case X86::PCMPISTRM128MEM:
   case X86::VPCMPISTRM128MEM:
-    return EmitPCMP(MI, BB, 3, true /* in-mem */);
   case X86::PCMPESTRM128REG:
   case X86::VPCMPESTRM128REG:
-    return EmitPCMP(MI, BB, 5, false /* in mem */);
   case X86::PCMPESTRM128MEM:
   case X86::VPCMPESTRM128MEM:
-    return EmitPCMP(MI, BB, 5, true /* in mem */);
+    assert(Subtarget->hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+
+  // String/text processing lowering.
+  case X86::PCMPISTRIREG:
+  case X86::VPCMPISTRIREG:
+  case X86::PCMPISTRIMEM:
+  case X86::VPCMPISTRIMEM:
+  case X86::PCMPESTRIREG:
+  case X86::VPCMPESTRIREG:
+  case X86::PCMPESTRIMEM:
+  case X86::VPCMPESTRIMEM:
+    assert(Subtarget->hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
 
-    // Thread synchronization.
+  // Thread synchronization.
   case X86::MONITOR:
-    return EmitMonitor(MI, BB);
+    return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
 
-    // Atomic Lowering.
-  case X86::ATOMAND32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
-                                               X86::AND32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               &X86::GR32RegClass);
-  case X86::ATOMOR32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
-                                               X86::OR32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               &X86::GR32RegClass);
-  case X86::ATOMXOR32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
-                                               X86::XOR32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               &X86::GR32RegClass);
-  case X86::ATOMNAND32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
-                                               X86::AND32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               &X86::GR32RegClass, true);
-  case X86::ATOMMIN32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
-  case X86::ATOMMAX32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
-  case X86::ATOMUMIN32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
-  case X86::ATOMUMAX32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+  // xbegin
+  case X86::XBEGIN:
+    return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
 
+  // Atomic Lowering.
+  case X86::ATOMAND8:
   case X86::ATOMAND16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
-                                               X86::AND16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               &X86::GR16RegClass);
+  case X86::ATOMAND32:
+  case X86::ATOMAND64:
+    // Fall through
+  case X86::ATOMOR8:
   case X86::ATOMOR16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
-                                               X86::OR16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               &X86::GR16RegClass);
+  case X86::ATOMOR32:
+  case X86::ATOMOR64:
+    // Fall through
   case X86::ATOMXOR16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
-                                               X86::XOR16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               &X86::GR16RegClass);
-  case X86::ATOMNAND16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
-                                               X86::AND16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               &X86::GR16RegClass, true);
-  case X86::ATOMMIN16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
-  case X86::ATOMMAX16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
-  case X86::ATOMUMIN16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
-  case X86::ATOMUMAX16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
-
-  case X86::ATOMAND8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
-                                               X86::AND8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               &X86::GR8RegClass);
-  case X86::ATOMOR8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
-                                               X86::OR8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               &X86::GR8RegClass);
   case X86::ATOMXOR8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
-                                               X86::XOR8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               &X86::GR8RegClass);
-  case X86::ATOMNAND8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
-                                               X86::AND8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               &X86::GR8RegClass, true);
-  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
-  // This group is for 64-bit host.
-  case X86::ATOMAND64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
-                                               X86::AND64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               &X86::GR64RegClass);
-  case X86::ATOMOR64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
-                                               X86::OR64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               &X86::GR64RegClass);
+  case X86::ATOMXOR32:
   case X86::ATOMXOR64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
-                                               X86::XOR64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               &X86::GR64RegClass);
+    // Fall through
+  case X86::ATOMNAND8:
+  case X86::ATOMNAND16:
+  case X86::ATOMNAND32:
   case X86::ATOMNAND64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
-                                               X86::AND64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               &X86::GR64RegClass, true);
-  case X86::ATOMMIN64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
+    // Fall through
+  case X86::ATOMMAX8:
+  case X86::ATOMMAX16:
+  case X86::ATOMMAX32:
   case X86::ATOMMAX64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
-  case X86::ATOMUMIN64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
+    // Fall through
+  case X86::ATOMMIN8:
+  case X86::ATOMMIN16:
+  case X86::ATOMMIN32:
+  case X86::ATOMMIN64:
+    // Fall through
+  case X86::ATOMUMAX8:
+  case X86::ATOMUMAX16:
+  case X86::ATOMUMAX32:
   case X86::ATOMUMAX64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+    // Fall through
+  case X86::ATOMUMIN8:
+  case X86::ATOMUMIN16:
+  case X86::ATOMUMIN32:
+  case X86::ATOMUMIN64:
+    return EmitAtomicLoadArith(MI, BB);
 
   // This group does 64-bit operations on a 32-bit host.
   case X86::ATOMAND6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::AND32rr, X86::AND32rr,
-                                               X86::AND32ri, X86::AND32ri,
-                                               false);
   case X86::ATOMOR6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::OR32rr, X86::OR32rr,
-                                               X86::OR32ri, X86::OR32ri,
-                                               false);
   case X86::ATOMXOR6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::XOR32rr, X86::XOR32rr,
-                                               X86::XOR32ri, X86::XOR32ri,
-                                               false);
   case X86::ATOMNAND6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::AND32rr, X86::AND32rr,
-                                               X86::AND32ri, X86::AND32ri,
-                                               true);
   case X86::ATOMADD6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::ADD32rr, X86::ADC32rr,
-                                               X86::ADD32ri, X86::ADC32ri,
-                                               false);
   case X86::ATOMSUB6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::SUB32rr, X86::SBB32rr,
-                                               X86::SUB32ri, X86::SBB32ri,
-                                               false);
+  case X86::ATOMMAX6432:
+  case X86::ATOMMIN6432:
+  case X86::ATOMUMAX6432:
+  case X86::ATOMUMIN6432:
   case X86::ATOMSWAP6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::MOV32rr, X86::MOV32rr,
-                                               X86::MOV32ri, X86::MOV32ri,
-                                               false);
+    return EmitAtomicLoadArith6432(MI, BB);
+
   case X86::VASTART_SAVE_XMM_REGS:
     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
 
   case X86::VAARG_64:
     return EmitVAARG64WithCustomInserter(MI, BB);
+
+  case X86::EH_SjLj_SetJmp32:
+  case X86::EH_SjLj_SetJmp64:
+    return emitEHSjLjSetJmp(MI, BB);
+
+  case X86::EH_SjLj_LongJmp32:
+  case X86::EH_SjLj_LongJmp64:
+    return emitEHSjLjLongJmp(MI, BB);
   }
 }
 
@@ -13150,12 +14248,12 @@
     return SDValue();
 
   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
-  if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+  if (Subtarget->hasAVX() && VT.is256BitVector() &&
       N->getOpcode() == ISD::VECTOR_SHUFFLE)
     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
 
   // Only handle 128 wide vector from here on.
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
     return SDValue();
 
   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
@@ -13169,12 +14267,12 @@
 }
 
 
-/// DCI, PerformTruncateCombine - Converts truncate operation to
+/// PerformTruncateCombine - Converts truncate operation to
 /// a sequence of vector shuffle operations.
 /// It is possible when we truncate 256-bit vector to 128-bit vector
-
-SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
-                                                  DAGCombinerInfo &DCI) const {
+static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const X86Subtarget *Subtarget)  {
   if (!DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -13215,8 +14313,9 @@
     // PSHUFD
     static const int ShufMask1[] = {0, 2, 0, 0};
 
-    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), ShufMask1);
-    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), ShufMask1);
+    SDValue Undef = DAG.getUNDEF(VT);
+    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
 
     // MOVLHPS
     static const int ShufMask2[] = {0, 1, 4, 5};
@@ -13274,10 +14373,9 @@
     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};
 
-    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, DAG.getUNDEF(MVT::v16i8),
-                                ShufMask1);
-    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, DAG.getUNDEF(MVT::v16i8),
-                                ShufMask1);
+    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);
 
     OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
     OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
@@ -13366,7 +14464,7 @@
     // alignment is valid.
     unsigned Align = LN0->getAlignment();
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    unsigned NewAlign = TLI.getTargetData()->
+    unsigned NewAlign = TLI.getDataLayout()->
       getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
 
     if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
@@ -13397,6 +14495,14 @@
     return NewOp;
 
   SDValue InputVector = N->getOperand(0);
+  // Detect whether we are trying to convert from mmx to i32 and the bitcast
+  // from mmx to v2i32 has a single usage.
+  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
+      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
+      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
+    return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+                       N->getValueType(0),
+                       InputVector.getNode()->getOperand(0));
 
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
@@ -13782,9 +14888,106 @@
   return SDValue();
 }
 
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
+// condition code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+  // Quit unless Cmp is a CMP, or a SUB whose value result is unused.
+  if (Cmp.getOpcode() != X86ISD::CMP &&
+      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+    return SDValue();
+
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  // Check the CMP operands. One of them should be 0 or 1 and the other should
+  // be a SETCC or a value extended from it.
+  SDValue Op1 = Cmp.getOperand(0);
+  SDValue Op2 = Cmp.getOperand(1);
+
+  SDValue SetCC;
+  const ConstantSDNode* C = 0;
+  bool needOppositeCond = (CC == X86::COND_E);
+
+  if ((C = dyn_cast<ConstantSDNode>(Op1)))
+    SetCC = Op2;
+  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+    SetCC = Op1;
+  else // Quit if neither operand is a constant.
+    return SDValue();
+
+  if (C->getZExtValue() == 1)
+    needOppositeCond = !needOppositeCond;
+  else if (C->getZExtValue() != 0)
+    // Quit if the constant is neither 0 nor 1.
+    return SDValue();
+
+  // Skip 'zext' node.
+  if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
+    SetCC = SetCC.getOperand(0);
+
+  switch (SetCC.getOpcode()) {
+  case X86ISD::SETCC:
+    // Set the condition code or opposite one if necessary.
+    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(1);
+  case X86ISD::CMOV: {
+    // Check whether the false/true values are canonical, i.e. 0 or 1.
+    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+    // Quit if true value is not a constant.
+    if (!TVal)
+      return SDValue();
+    // Quit if false value is not a constant.
+    if (!FVal) {
+      // Special case for RDRAND, where the false value is defined to be 0.
+      SDValue Op = SetCC.getOperand(0);
+      if (Op.getOpcode() != X86ISD::RDRAND)
+        return SDValue();
+    }
+    // Quit if false value is not the constant 0 or 1.
+    bool FValIsFalse = true;
+    if (FVal && FVal->getZExtValue() != 0) {
+      if (FVal->getZExtValue() != 1)
+        return SDValue();
+      // If FVal is 1, opposite cond is needed.
+      needOppositeCond = !needOppositeCond;
+      FValIsFalse = false;
+    }
+    // Quit if TVal is not the constant opposite of FVal.
+    if (FValIsFalse && TVal->getZExtValue() != 1)
+      return SDValue();
+    if (!FValIsFalse && TVal->getZExtValue() != 0)
+      return SDValue();
+    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(3);
+  }
+  }
+
+  return SDValue();
+}
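
In scalar terms, the combine exploits the fact that a SETCC result is already
the 0/1 value the outer compare tests, so the extra CMP can be folded away. A
minimal C++ model of the identity (BoolTest is illustrative):

    static bool BoolTest(bool Cond) {
      int SetCC = Cond ? 1 : 0;  // X86ISD::SETCC materializes the flag as 0/1
      // (CMP SetCC, 0) with COND_NE is just Cond; with COND_E it is !Cond.
      return SetCC != 0;         // folds to: return Cond;
    }
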
+
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
-                                  TargetLowering::DAGCombinerInfo &DCI) {
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
   DebugLoc DL = N->getDebugLoc();
 
   // If the flag operand isn't dead, don't touch this CMOV.
@@ -13795,6 +14998,7 @@
   SDValue TrueOp = N->getOperand(1);
   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
   SDValue Cond = N->getOperand(3);
+
   if (CC == X86::COND_E || CC == X86::COND_NE) {
     switch (Cond.getOpcode()) {
     default: break;
@@ -13806,6 +15010,18 @@
     }
   }
 
+  SDValue Flags = checkBoolTestSetCCCombine(Cond, CC);
+  if (Flags.getNode() &&
+      // Extra check, as FCMOV only supports a subset of X86 condition codes.
+      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
+    SDValue Ops[] = { FalseOp, TrueOp,
+                      DAG.getConstant(CC, MVT::i8), Flags };
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+                       Ops, array_lengthof(Ops));
+  }
+
   // If this is a select between two integer constants, try to do some
   // optimizations.  Note that the operands are ordered the opposite of SELECT
   // operands.
@@ -13816,6 +15032,7 @@
       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
         CC = X86::GetOppositeBranchCondition(CC);
         std::swap(TrueC, FalseC);
+        std::swap(TrueOp, FalseOp);
       }
 
       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
@@ -13898,6 +15115,46 @@
       }
     }
   }
+
+  // Handle these cases:
+  //   (select (x != c), e, c) -> (select (x != c), e, x),
+  //   (select (x == c), c, e) -> (select (x == c), x, e)
+  // where c is an integer constant, and the "select" is the combination
+  // of CMOV and CMP.
+  //
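+  // For example, in (select (x != 10), y, 10) the result equals x whenever
+  // the constant arm is selected (x == 10), so the constant can be replaced
+  // by x itself and the CMOV can read it from a register.
+  //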
+  // The rationale for this change is that a conditional move from a constant
+  // needs two instructions, whereas a conditional move from a register needs
+  // only one.
+  //
+  // CAVEAT: Replacing a constant with a symbolic value may obscure some
+  //  instruction-combining opportunities, so this optimization must be
+  //  postponed as late as possible.
+  //
+  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+    // The DCI checks above postpone the optimization until after
+    // legalization.
+
+    ConstantSDNode *CmpAgainst = 0;
+    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+        !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+      if (CC == X86::COND_NE &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      if (CC == X86::COND_E &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+                          DAG.getConstant(CC, MVT::i8), Cond };
+        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops,
+                           array_lengthof(Ops));
+      }
+    }
+  }
+
   return SDValue();
 }
 
@@ -14228,7 +15485,7 @@
 
   // Sometimes the operand may come from an insert_subvector building a
   // 256-bit all-ones vector.
-  if (VT.getSizeInBits() == 256 &&
+  if (VT.is256BitVector() &&
       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
     SDValue V1 = N->getOperand(0);
     SDValue V2 = N->getOperand(1);
@@ -14554,11 +15811,11 @@
   ISD::LoadExtType Ext = Ld->getExtensionType();
 
   // If this is a vector EXT Load then attempt to optimize it using a
-  // shuffle. We need SSE4 for the shuffles.
+  // shuffle. We need SSSE3 for the shuffles.
   // TODO: It is possible to support ZExt by zeroing the undef values
   // during the shuffle phase or after the shuffle.
   if (RegVT.isVector() && RegVT.isInteger() &&
-      Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+      Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
     assert(MemVT != RegVT && "Cannot extend to the same type");
     assert(MemVT.isVector() && "Must load a vector from memory");
 
@@ -14673,7 +15930,7 @@
   // On Sandy Bridge, 256-bit memory operations are executed by two
   // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
   // memory operation.
-  if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2() &&
+  if (VT.is256BitVector() && !Subtarget->hasAVX2() &&
       StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
       StoredVal.getNumOperands() == 2) {
     SDValue Value0 = StoredVal.getOperand(0);
@@ -14784,7 +16041,8 @@
     return SDValue();
 
   const Function *F = DAG.getMachineFunction().getFunction();
-  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
+  bool NoImplicitFloatOps = F->getFnAttributes().
+    hasAttribute(Attributes::NoImplicitFloat);
   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
                      && Subtarget->hasSSE2();
   if ((VT.isVector() ||
@@ -15056,6 +16314,29 @@
   return SDValue();
 }
 
+/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
+/// X86ISD::FMAX nodes.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+  // Only perform these optimizations if unsafe FP math is enabled.
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
+  // into FMINC and FMAXC, which are commutative operations.
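+  // (X86ISD::FMIN/FMAX follow the SSE semantics: with a NaN operand, or with
+  // +0.0 and -0.0, the result is the second operand, so they are not
+  // commutative in general; unsafe-math lets us ignore those cases.)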
+  unsigned NewOp = 0;
+  switch (N->getOpcode()) {
+    default: llvm_unreachable("unknown opcode");
+    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
+    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
+  }
+
+  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+                     N->getOperand(0), N->getOperand(1));
+}
+
 /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   // FAND(0.0, x) -> 0.0
@@ -15131,19 +16412,19 @@
     // concat the vectors to original VT
 
     unsigned NumElems = OpVT.getVectorNumElements();
+    SDValue Undef = DAG.getUNDEF(OpVT);
+
     SmallVector<int,8> ShufMask1(NumElems, -1);
     for (unsigned i = 0; i != NumElems/2; ++i)
       ShufMask1[i] = i;
 
-    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
-                                        &ShufMask1[0]);
+    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
 
     SmallVector<int,8> ShufMask2(NumElems, -1);
     for (unsigned i = 0; i != NumElems/2; ++i)
       ShufMask2[i] = i + NumElems/2;
 
-    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
-                                        &ShufMask2[0]);
+    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
 
     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
                                   VT.getVectorNumElements()/2);
@@ -15161,8 +16442,13 @@
   DebugLoc dl = N->getDebugLoc();
   EVT VT = N->getValueType(0);
 
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
   EVT ScalarVT = VT.getScalarType();
-  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
     return SDValue();
 
   SDValue A = N->getOperand(0);
@@ -15184,9 +16470,10 @@
 
   unsigned Opcode;
   if (!NegMul)
-    Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
   else
-    Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
   return DAG.getNode(Opcode, dl, VT, A, B, C);
 }
 
@@ -15283,36 +16570,77 @@
   return SDValue();
 }
 
+// Helper function of PerformSETCCCombine. It materializes "setb reg" as
+// "sbb reg,reg", since the latter can be extended without a zext and
+// produces an all-ones bit which is more useful than 0/1 in some cases.
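+//
+// The trailing "and $1" recovers the 0/1 value that SETCC users expect;
+// consumers that only want the all-ones pattern can typically fold it away.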
+static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+  return DAG.getNode(ISD::AND, DL, MVT::i8,
+                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+                     DAG.getConstant(1, MVT::i8));
+}
+
  // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
-  unsigned X86CC = N->getConstantOperandVal(0);
-  SDValue EFLAG = N->getOperand(1);
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
   DebugLoc DL = N->getDebugLoc();
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
+
+  if (CC == X86::COND_A) {
+    // Try to convert COND_A into COND_B in an attempt to facilitate
+    // materializing "setb reg".
+    //
+    // Do not flip "e > c", where "c" is a constant, because the CMP
+    // instruction cannot take an immediate as its first operand.
+    //
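+    // E.g. (setcc COND_A (sub a, b)) becomes (setcc COND_B (sub b, a)):
+    // unsigned "a > b" is the same as "b < a", and COND_B is what
+    // MaterializeSETB then turns into an sbb.
+    //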
+    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && 
+        EFLAGS.getValueType().isInteger() &&
+        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+                                   EFLAGS.getNode()->getVTList(),
+                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+      return MaterializeSETB(DL, NewEFLAGS, DAG);
+    }
+  }
 
   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
   // a zext and produces an all-ones bit which is more useful than 0/1 in some
   // cases.
-  if (X86CC == X86::COND_B)
-    return DAG.getNode(ISD::AND, DL, MVT::i8,
-                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
-                                   DAG.getConstant(X86CC, MVT::i8), EFLAG),
-                       DAG.getConstant(1, MVT::i8));
+  if (CC == X86::COND_B)
+    return MaterializeSETB(DL, EFLAGS, DAG);
+
+  SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
 
   return SDValue();
 }
 
-static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
-  SDValue Op0 = N->getOperand(0);
-  EVT InVT = Op0->getValueType(0);
-
-  // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32))
-  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
-    DebugLoc dl = N->getDebugLoc();
-    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
-    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
-    // Notice that we use SINT_TO_FP because we know that the high bits
-    // are zero and SINT_TO_FP is better supported by the hardware.
-    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue EFLAGS = N->getOperand(3);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+  SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+                       Flags);
   }
 
   return SDValue();
@@ -15349,20 +16677,6 @@
   return SDValue();
 }
 
-static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-
-  // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT()
-  if (VT == MVT::v8i8 || VT == MVT::v4i8) {
-    DebugLoc dl = N->getDebugLoc();
-    MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
-    SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
-  }
-
-  return SDValue();
-}
-
 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                  X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -15477,6 +16791,21 @@
   return OptimizeConditionalInDecrement(N, DAG);
 }
 
+/// performVZEXTCombine - Performs dag combines on X86ISD::VZEXT nodes.
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget *Subtarget) {
+  // (vzext (bitcast (vzext x))) -> (vzext x)
+  SDValue In = N->getOperand(0);
+  while (In.getOpcode() == ISD::BITCAST)
+    In = In.getOperand(0);
+
+  if (In.getOpcode() != X86ISD::VZEXT)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+                     In.getOperand(0));
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -15486,7 +16815,7 @@
     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   case ISD::VSELECT:
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
-  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
+  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
@@ -15499,22 +16828,24 @@
   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
-  case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG);
   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
-  case ISD::FP_TO_SINT:     return PerformFP_TO_SINTCombine(N, DAG);
   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
+  case X86ISD::FMIN:
+  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
-  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
+  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
-  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGN:
   case X86ISD::UNPCKH:
@@ -15942,7 +17273,7 @@
     return;
   case 'K':
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
-      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
+      if (isInt<8>(C->getSExtValue())) {
         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
         break;
       }
@@ -16267,3 +17598,207 @@
 
   return Res;
 }
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+struct X86CostTblEntry {
+  int ISD;
+  MVT Type;
+  unsigned Cost;
+};
+
+static int
+FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) {
+  for (unsigned int i = 0; i < len; ++i)
+    if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty)
+      return i;
+
+  // Could not find an entry.
+  return -1;
+}
+
+struct X86TypeConversionCostTblEntry {
+  int ISD;
+  MVT Dst;
+  MVT Src;
+  unsigned Cost;
+};
+
+static int
+FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len,
+                   int ISD, MVT Dst, MVT Src) {
+  for (unsigned int i = 0; i < len; ++i)
+    if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst)
+      return i;
+
+  // Could not find an entry.
+  return -1;
+}
+
+unsigned
+X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
+                                                     Type *Ty) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
+
+  int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  static const X86CostTblEntry AVX1CostTable[] = {
+    // We don't have to scalarize unsupported ops. We can issue two half-sized
+    // operations and we only need to extract the upper YMM half.
+    // Two ops + 1 extract + 1 insert = 4.
+    { ISD::MUL,     MVT::v8i32,    4 },
+    { ISD::SUB,     MVT::v8i32,    4 },
+    { ISD::ADD,     MVT::v8i32,    4 },
+    { ISD::MUL,     MVT::v4i64,    4 },
+    { ISD::SUB,     MVT::v4i64,    4 },
+    { ISD::ADD,     MVT::v4i64,    4 },
+    };
+
+  // Look for AVX1 lowering tricks.
+  if (ST.hasAVX()) {
+    int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
+                          LT.second);
+    if (Idx != -1)
+      return LT.first * AVX1CostTable[Idx].Cost;
+  }
+  // Fallback to the default implementation.
+  return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
+}
+
+unsigned
+X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                 unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Val);
+
+    // If the type is legalized to a scalar type, the extract is free.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
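+    // E.g. if v8i32 is split into two v4i32 halves, extracting element 5
+    // becomes extracting element 1 of the high half.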
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // Floating point scalars are already located in index #0.
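+    // (The scalar already lives in the low lane of the XMM register, so the
+    // extract is free.)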
+    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+      return 0;
+  }
+
+  return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
+                                                          Type *ValTy,
+                                                          Type *CondTy) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
+
+  MVT MTy = LT.second;
+
+  int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  static const X86CostTblEntry SSE42CostTbl[] = {
+    { ISD::SETCC,   MVT::v2f64,   1 },
+    { ISD::SETCC,   MVT::v4f32,   1 },
+    { ISD::SETCC,   MVT::v2i64,   1 },
+    { ISD::SETCC,   MVT::v4i32,   1 },
+    { ISD::SETCC,   MVT::v8i16,   1 },
+    { ISD::SETCC,   MVT::v16i8,   1 },
+  };
+
+  static const X86CostTblEntry AVX1CostTbl[] = {
+    { ISD::SETCC,   MVT::v4f64,   1 },
+    { ISD::SETCC,   MVT::v8f32,   1 },
+    // AVX1 does not support 8-wide integer compare.
+    { ISD::SETCC,   MVT::v4i64,   4 },
+    { ISD::SETCC,   MVT::v8i32,   4 },
+    { ISD::SETCC,   MVT::v16i16,  4 },
+    { ISD::SETCC,   MVT::v32i8,   4 },
+  };
+
+  static const X86CostTblEntry AVX2CostTbl[] = {
+    { ISD::SETCC,   MVT::v4i64,   1 },
+    { ISD::SETCC,   MVT::v8i32,   1 },
+    { ISD::SETCC,   MVT::v16i16,  1 },
+    { ISD::SETCC,   MVT::v32i8,   1 },
+  };
+
+  if (ST.hasSSE42()) {
+    int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * SSE42CostTbl[Idx].Cost;
+  }
+
+  // Check AVX2 before AVX: every AVX2 subtarget also reports hasAVX(), and
+  // the AVX1 table would otherwise shadow the cheaper AVX2 entries.
+  if (ST.hasAVX2()) {
+    int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX2CostTbl[Idx].Cost;
+  }
+
+  if (ST.hasAVX()) {
+    int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX1CostTbl[Idx].Cost;
+  }
+
+  return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
+                                                        Type *Dst,
+                                                        Type *Src) const {
+  int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
+
+  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
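+  // Each entry gives a rough instruction count for the conversion once both
+  // types are legal; conversions not listed fall back to the generic cost
+  // model below.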
+  static const X86TypeConversionCostTblEntry AVXConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 1 },
+    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  1 },
+    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 1 },
+    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1,  6 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1,  9 },
+    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64, 3 },
+  };
+
+  if (ST.hasAVX()) {
+    int Idx = FindInConvertTable(AVXConversionTbl,
+                                 array_lengthof(AVXConversionTbl),
+                                 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return AVXConversionTbl[Idx].Cost;
+  }
+
+  return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
+}
+

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86ISelLowering.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86ISelLowering.h Tue Jan 15 11:16:16 2013
@@ -19,6 +19,7 @@
 #include "X86RegisterInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetTransformImpl.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -137,15 +138,15 @@
       /// relative displacements.
       WrapperRIP,
 
-      /// MOVQ2DQ - Copies a 64-bit value from an MMX vector to the low word
-      /// of an XMM vector, with the high word zero filled.
-      MOVQ2DQ,
-
       /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector
       /// to an MMX vector.  If you think this is too close to the previous
       /// mnemonic, so do I; blame Intel.
       MOVDQ2Q,
 
+      /// MMX_MOVD2W - Copies a 32-bit value from the low word of an MMX
+      /// vector to a GPR.
+      MMX_MOVD2W,
+
       /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
       /// i32, corresponds to X86::PEXTRB.
       PEXTRB,
@@ -199,6 +200,9 @@
       ///
       FMAX, FMIN,
 
+      /// FMAXC, FMINC - Commutative FMIN and FMAX.
+      FMAXC, FMINC,
+
       /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
       /// approximation.  Note that these typically require refinement
       /// in order to obtain suitable precision.
@@ -218,6 +222,12 @@
       // EH_RETURN - Exception Handling helpers.
       EH_RETURN,
 
+      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
       /// TC_RETURN - Tail call return.
       ///   operand #0 chain
       ///   operand #1 callee (register or absolute)
@@ -231,6 +241,18 @@
       // VSEXT_MOVL - Vector move low and sign extend.
       VSEXT_MOVL,
 
+      // VZEXT - Vector integer zero-extend.
+      VZEXT,
+
+      // VSEXT - Vector integer signed-extend.
+      VSEXT,
+
+      // VFPEXT - Vector FP extend.
+      VFPEXT,
+
+      // VFPROUND - Vector FP round.
+      VFPROUND,
+
       // VSHL, VSRL - 128-bit vector logical left / right shift
       VSHLDQ, VSRLDQ,
 
@@ -346,6 +368,10 @@
       ATOMXOR64_DAG,
       ATOMAND64_DAG,
       ATOMNAND64_DAG,
+      ATOMMAX64_DAG,
+      ATOMMIN64_DAG,
+      ATOMUMAX64_DAG,
+      ATOMUMIN64_DAG,
       ATOMSWAP64_DAG,
 
       // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
@@ -459,10 +485,6 @@
     getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
                                  unsigned JTI, MCContext &Ctx) const;
 
-    /// getStackPtrReg - Return the stack pointer register we are using: either
-    /// ESP or RSP.
-    unsigned getStackPtrReg() const { return X86StackPtr; }
-
     /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
     /// function arguments in the caller parameter area. For X86, aggregates
    /// that contain SSE vectors are placed at 16-byte boundaries while the rest are at
@@ -695,10 +717,7 @@
     /// make the right decision when generating code for different targets.
     const X86Subtarget *Subtarget;
     const X86RegisterInfo *RegInfo;
-    const TargetData *TD;
-
-    /// X86StackPtr - X86 physical register used as stack ptr.
-    unsigned X86StackPtr;
+    const DataLayout *TD;
 
     /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
     /// floating point ops.
@@ -742,6 +761,7 @@
                                            bool isVarArg,
                                            bool isCalleeStructRet,
                                            bool isCallerStructRet,
+                                           Type *RetTy,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -761,15 +781,11 @@
     SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                                    SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
@@ -783,12 +799,15 @@
     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerToBT(SDValue And, ISD::CondCode CC,
                       DebugLoc dl, SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
@@ -800,37 +819,26 @@
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
 
-    SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
-    SDValue PerformTruncateCombine(SDNode* N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const;
 
-    // Utility functions to help LowerVECTOR_SHUFFLE
-    SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const;
+    // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
+    SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
     SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
+    SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
 
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
@@ -863,54 +871,17 @@
                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                    LLVMContext &Context) const;
 
-    void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
-                                 SelectionDAG &DAG, unsigned NewOp) const;
-
-    /// Utility function to emit string processing sse4.2 instructions
-    /// that return in xmm0.
-    /// This takes the instruction to expand, the associated machine basic
-    /// block, the number of args, and whether or not the second arg is
-    /// in memory or not.
-    MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB,
-                                unsigned argNum, bool inMem) const;
-
-    /// Utility functions to emit monitor and mwait instructions. These
-    /// need to make sure that the arguments to the intrinsic are in the
-    /// correct registers.
-    MachineBasicBlock *EmitMonitor(MachineInstr *MI,
-                                   MachineBasicBlock *BB) const;
-    MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const;
-
-    /// Utility function to emit atomic bitwise operations (and, or, xor).
-    /// It takes the bitwise instruction to expand, the associated machine basic
-    /// block, and the associated X86 opcodes for reg/reg and reg/imm.
-    MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
-                                                    MachineInstr *BInstr,
-                                                    MachineBasicBlock *BB,
-                                                    unsigned regOpc,
-                                                    unsigned immOpc,
-                                                    unsigned loadOpc,
-                                                    unsigned cxchgOpc,
-                                                    unsigned notOpc,
-                                                    unsigned EAXreg,
-                                              const TargetRegisterClass *RC,
-                                                    bool Invert = false) const;
-
-    MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
-                                                    MachineInstr *BInstr,
-                                                    MachineBasicBlock *BB,
-                                                    unsigned regOpcL,
-                                                    unsigned regOpcH,
-                                                    unsigned immOpcL,
-                                                    unsigned immOpcH,
-                                                    bool Invert = false) const;
-
-    /// Utility function to emit atomic min and max.  It takes the min/max
-    /// instruction to expand, the associated basic block, and the associated
-    /// cmov opcode for moving the min or max value.
-    MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
-                                                          MachineBasicBlock *BB,
-                                                        unsigned cmovOpc) const;
+    /// Utility function to emit atomic-load-arith operations (and, or, xor,
+    /// nand, max, min, umax, umin). It takes the corresponding instruction to
+    /// expand, the associated machine basic block, and the associated X86
+    /// opcodes for reg/reg.
+    MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) const;
+
+    /// Utility function to emit atomic-load-arith operations (and, or, xor,
+    /// nand, add, sub, swap) for 64-bit operands on 32-bit target.
+    MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
+                                               MachineBasicBlock *MBB) const;
 
     // Utility function to emit the low-level va_arg code for X86-64.
     MachineBasicBlock *EmitVAARG64WithCustomInserter(
@@ -938,6 +909,12 @@
     MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
                                           MachineBasicBlock *BB) const;
 
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+                                        MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) const;
+
     /// Emit nodes that will be selected as "test Op0,Op0", or something
     /// equivalent, for use with the given x86 condition code.
     SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
@@ -955,6 +932,23 @@
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                              const TargetLibraryInfo *libInfo);
   }
+
+  class X86VectorTargetTransformInfo : public VectorTargetTransformImpl {
+  public:
+    explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
+    VectorTargetTransformImpl(TL) {}
+
+    virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+
+    virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                        unsigned Index) const;
+
+    virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                        Type *CondTy) const;
+
+    virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                      Type *Src) const;
+  };
 }
 
 #endif    // X86ISELLOWERING_H

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrCompiler.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrCompiler.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrCompiler.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrCompiler.td Tue Jan 15 11:16:16 2013
@@ -165,6 +165,33 @@
 
 }
 
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in {
+  def EH_SjLj_SetJmp32  : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In32BitMode]>;
+  def EH_SjLj_SetJmp64  : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+                            "#EH_SJLJ_SETJMP64",
+                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In64BitMode]>;
+  let isTerminator = 1 in {
+  def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(X86eh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In32BitMode]>;
+  def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+                            "#EH_SJLJ_LONGJMP64",
+                            [(X86eh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In64BitMode]>;
+  }
+}
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+  def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+                        "#EH_SjLj_Setup\t$dst", []>;
+}
+
 //===----------------------------------------------------------------------===//
 // Pseudo instructions used by segmented stacks.
 //
@@ -230,25 +257,18 @@
                         IIC_ALU_NONMEM>;
 
 // Use sbb to materialize carry bit.
-let Uses = [EFLAGS], Defs = [EFLAGS], isCodeGenOnly = 1 in {
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1 in {
 // FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
 // However, Pat<> can't replicate the destination reg into the inputs of the
 // result.
-// FIXME: Change these to have encoding Pseudo when X86MCCodeEmitter replaces
-// X86CodeEmitter.
-def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins), "",
-                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))],
-                 IIC_ALU_NONMEM>;
-def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins), "",
-                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))],
-                 IIC_ALU_NONMEM>,
-                OpSize;
-def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins), "",
-                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))],
-                 IIC_ALU_NONMEM>;
-def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "",
-                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))],
-                 IIC_ALU_NONMEM>;
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 } // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1
 
 
@@ -453,6 +473,11 @@
                     "#CMOV_GR16* PSEUDO!",
                     [(set GR16:$dst,
                       (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+} // Predicates = [NoCMov]
+
+// fcmov doesn't handle all possible EFLAGS, so provide a fallback if there is
+// no SSE1.
+let Predicates = [FPStackf32] in
 def CMOV_RFP32 : I<0, Pseudo,
                     (outs RFP32:$dst),
                     (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
@@ -460,6 +485,9 @@
                     [(set RFP32:$dst,
                       (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
                                                   EFLAGS))]>;
+// fcmov doesn't handle all possible EFLAGS, so provide a fallback if there is
+// no SSE2.
+let Predicates = [FPStackf64] in
 def CMOV_RFP64 : I<0, Pseudo,
                     (outs RFP64:$dst),
                     (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
@@ -474,7 +502,6 @@
                     [(set RFP80:$dst,
                       (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
                                                   EFLAGS))]>;
-} // Predicates = [NoCMov]
 } // UsesCustomInserter = 1, Uses = [EFLAGS]
 
 
@@ -482,130 +509,74 @@
 // Atomic Instruction Pseudo Instructions
 //===----------------------------------------------------------------------===//
 
-// Atomic exchange, and, or, xor
-let Constraints = "$val = $dst", Defs = [EFLAGS],
-                  usesCustomInserter = 1 in {
+// Pseudo atomic instructions
 
-def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
-               "#ATOMAND8 PSEUDO!",
-               [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>;
-def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
-               "#ATOMOR8 PSEUDO!",
-               [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>;
-def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
-               "#ATOMXOR8 PSEUDO!",
-               [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>;
-def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
-               "#ATOMNAND8 PSEUDO!",
-               [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>;
-
-def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
-               "#ATOMAND16 PSEUDO!",
-               [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>;
-def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
-               "#ATOMOR16 PSEUDO!",
-               [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>;
-def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
-               "#ATOMXOR16 PSEUDO!",
-               [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>;
-def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
-               "#ATOMNAND16 PSEUDO!",
-               [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>;
-def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
-               "#ATOMMIN16 PSEUDO!",
-               [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>;
-def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
-               "#ATOMMAX16 PSEUDO!",
-               [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>;
-def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
-               "#ATOMUMIN16 PSEUDO!",
-               [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>;
-def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
-               "#ATOMUMAX16 PSEUDO!",
-               [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>;
-
-
-def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
-               "#ATOMAND32 PSEUDO!",
-               [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>;
-def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
-               "#ATOMOR32 PSEUDO!",
-               [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>;
-def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
-               "#ATOMXOR32 PSEUDO!",
-               [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>;
-def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
-               "#ATOMNAND32 PSEUDO!",
-               [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>;
-def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
-               "#ATOMMIN32 PSEUDO!",
-               [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>;
-def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
-               "#ATOMMAX32 PSEUDO!",
-               [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>;
-def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
-               "#ATOMUMIN32 PSEUDO!",
-               [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>;
-def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
-               "#ATOMUMAX32 PSEUDO!",
-               [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>;
-
-
-
-def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
-               "#ATOMAND64 PSEUDO!",
-               [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>;
-def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
-               "#ATOMOR64 PSEUDO!",
-               [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>;
-def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
-               "#ATOMXOR64 PSEUDO!",
-               [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>;
-def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
-               "#ATOMNAND64 PSEUDO!",
-               [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>;
-def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val),
-               "#ATOMMIN64 PSEUDO!",
-               [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>;
-def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
-               "#ATOMMAX64 PSEUDO!",
-               [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>;
-def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
-               "#ATOMUMIN64 PSEUDO!",
-               [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>;
-def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
-               "#ATOMUMAX64 PSEUDO!",
-               [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
-}
-
-let Constraints = "$val1 = $dst1, $val2 = $dst2",
-                  Defs = [EFLAGS, EAX, EBX, ECX, EDX],
-                  Uses = [EAX, EBX, ECX, EDX],
-                  mayLoad = 1, mayStore = 1,
-                  usesCustomInserter = 1 in {
-def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
-                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
-               "#ATOMAND6432 PSEUDO!", []>;
-def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
-                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
-               "#ATOMOR6432 PSEUDO!", []>;
-def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
-                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
-               "#ATOMXOR6432 PSEUDO!", []>;
-def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
-                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
-               "#ATOMNAND6432 PSEUDO!", []>;
-def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
-                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
-               "#ATOMADD6432 PSEUDO!", []>;
-def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
-                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
-               "#ATOMSUB6432 PSEUDO!", []>;
-def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
-                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
-               "#ATOMSWAP6432 PSEUDO!", []>;
+multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> {
+  let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in {
+    def #NAME#8  : I<0, Pseudo, (outs GR8:$dst),
+                     (ins i8mem:$ptr, GR8:$val),
+                     !strconcat(mnemonic, "8 PSEUDO!"), []>;
+    def #NAME#16 : I<0, Pseudo, (outs GR16:$dst),
+                     (ins i16mem:$ptr, GR16:$val),
+                     !strconcat(mnemonic, "16 PSEUDO!"), []>;
+    def #NAME#32 : I<0, Pseudo, (outs GR32:$dst),
+                     (ins i32mem:$ptr, GR32:$val),
+                     !strconcat(mnemonic, "32 PSEUDO!"), []>;
+    def #NAME#64 : I<0, Pseudo, (outs GR64:$dst),
+                     (ins i64mem:$ptr, GR64:$val),
+                     !strconcat(mnemonic, "64 PSEUDO!"), []>;
+  }
+}
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> {
+  def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val),
+            (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>;
+  def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val),
+            (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>;
+  def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val),
+            (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>;
+  def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val),
+            (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>;
 }
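+
+// Each "defm" below expands PSEUDO_ATOMIC_LOAD_BINOP into 8/16/32/64-bit
+// pseudos (e.g. ATOMAND8..ATOMAND64), replacing the hand-written definitions
+// removed above.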
 
+// Atomic and, or, xor, nand, max, min, umax, umin
+defm ATOMAND  : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">;
+defm ATOMOR   : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">;
+defm ATOMXOR  : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">;
+defm ATOMMAX  : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">;
+defm ATOMMIN  : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">;
+
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND",  "atomic_load_and">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR",   "atomic_load_or">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR",  "atomic_load_xor">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX",  "atomic_load_max">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN",  "atomic_load_min">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">;
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> {
+  let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in
+    def #NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                       (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+                       !strconcat(mnemonic, "6432 PSEUDO!"), []>;
+}
+
+defm ATOMAND  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">;
+defm ATOMOR   : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">;
+defm ATOMXOR  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">;
+defm ATOMADD  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">;
+defm ATOMSUB  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">;
+defm ATOMMAX  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">;
+defm ATOMMIN  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">;
+defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
+
 //===----------------------------------------------------------------------===//
 // Normal-Instructions-With-Lock-Prefix Pseudo Instructions
 //===----------------------------------------------------------------------===//
@@ -617,7 +588,6 @@
 // TODO: Get this to fold the constant into the instruction.
 let isCodeGenOnly = 1, Defs = [EFLAGS] in
 def OR32mrLocked  : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
-                      "lock\n\t"
                       "or{l}\t{$zero, $dst|$dst, $zero}",
                       [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK;
 
@@ -637,72 +607,72 @@
 def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
                    MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
-                   !strconcat("lock\n\t", mnemonic, "{b}\t",
+                   !strconcat(mnemonic, "{b}\t",
                               "{$src2, $dst|$dst, $src2}"),
                    [], IIC_ALU_NONMEM>, LOCK;
 def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                     RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                     MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
-                    !strconcat("lock\n\t", mnemonic, "{w}\t",
+                    !strconcat(mnemonic, "{w}\t",
                                "{$src2, $dst|$dst, $src2}"),
                     [], IIC_ALU_NONMEM>, OpSize, LOCK;
 def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                     RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                     MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
-                    !strconcat("lock\n\t", mnemonic, "{l}\t",
+                    !strconcat(mnemonic, "{l}\t",
                                "{$src2, $dst|$dst, $src2}"),
                     [], IIC_ALU_NONMEM>, LOCK;
 def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                      RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                      MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
-                     !strconcat("lock\n\t", mnemonic, "{q}\t",
+                     !strconcat(mnemonic, "{q}\t",
                                 "{$src2, $dst|$dst, $src2}"),
                      [], IIC_ALU_NONMEM>, LOCK;
 
 def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
                      ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
-                     !strconcat("lock\n\t", mnemonic, "{b}\t",
+                     !strconcat(mnemonic, "{b}\t",
                                 "{$src2, $dst|$dst, $src2}"),
                      [], IIC_ALU_MEM>, LOCK;
 
 def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                        ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                        ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
-                       !strconcat("lock\n\t", mnemonic, "{w}\t",
+                       !strconcat(mnemonic, "{w}\t",
                                   "{$src2, $dst|$dst, $src2}"),
-                       [], IIC_ALU_MEM>, LOCK;
+                       [], IIC_ALU_MEM>, OpSize, LOCK;
 
 def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                        ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                        ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
-                       !strconcat("lock\n\t", mnemonic, "{l}\t",
+                       !strconcat(mnemonic, "{l}\t",
                                   "{$src2, $dst|$dst, $src2}"),
                        [], IIC_ALU_MEM>, LOCK;
 
 def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                           ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                           ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
-                          !strconcat("lock\n\t", mnemonic, "{q}\t",
+                          !strconcat(mnemonic, "{q}\t",
                                      "{$src2, $dst|$dst, $src2}"),
                           [], IIC_ALU_MEM>, LOCK;
 
 def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                        ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                        ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
-                       !strconcat("lock\n\t", mnemonic, "{w}\t",
+                       !strconcat(mnemonic, "{w}\t",
                                   "{$src2, $dst|$dst, $src2}"),
-                       [], IIC_ALU_MEM>, LOCK;
+                       [], IIC_ALU_MEM>, OpSize, LOCK;
 def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                        ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                        ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
-                       !strconcat("lock\n\t", mnemonic, "{l}\t",
+                       !strconcat(mnemonic, "{l}\t",
                                   "{$src2, $dst|$dst, $src2}"),
                        [], IIC_ALU_MEM>, LOCK;
 def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                         ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                         ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
-                        !strconcat("lock\n\t", mnemonic, "{q}\t",
+                        !strconcat(mnemonic, "{q}\t",
                                    "{$src2, $dst|$dst, $src2}"),
                         [], IIC_ALU_MEM>, LOCK;
 
@@ -717,106 +687,116 @@
 defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
 
 // Optimized codegen when the non-memory output is not used.
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          string mnemonic> {
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
 
-def LOCK_INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
-                    "lock\n\t"
-                    "inc{b}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst),
-                    "lock\n\t"
-                    "inc{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK;
-def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
-                    "lock\n\t"
-                    "inc{l}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
-                     "lock\n\t"
-                     "inc{q}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-
-def LOCK_DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst),
-                    "lock\n\t"
-                    "dec{b}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
-                    "lock\n\t"
-                    "dec{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK;
-def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
-                    "lock\n\t"
-                    "dec{l}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
-                      "lock\n\t"
-                      "dec{q}\t$dst", [], IIC_UNARY_MEM>, LOCK;
+def #NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst),
+                  !strconcat(mnemonic, "{b}\t$dst"),
+                  [], IIC_UNARY_MEM>, LOCK;
+def #NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+                  !strconcat(mnemonic, "{w}\t$dst"),
+                  [], IIC_UNARY_MEM>, OpSize, LOCK;
+def #NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+                  !strconcat(mnemonic, "{l}\t$dst"),
+                  [], IIC_UNARY_MEM>, LOCK;
+def #NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+                   !strconcat(mnemonic, "{q}\t$dst"),
+                   [], IIC_UNARY_MEM>, LOCK;
+}
 }
 
+defm LOCK_INC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
+defm LOCK_DEC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
+
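As context for the lock-prefixed forms above: when the fetched value of an
atomic read-modify-write is unused, the cheaper memory-destination encoding
can be selected. A minimal C sketch (function name invented, not from the
patch):

  #include <stdatomic.h>

  /* Hypothetical illustration: the old value is discarded, so this can
     be selected as "lock addl $1, (%rdi)" (or "lock incl") through the
     LOCK_ADD/LOCK_INC patterns, instead of a "lock xadd" that would
     also have to produce a result register. */
  void bump(atomic_int *counter) {
      atomic_fetch_add(counter, 1);   /* result unused */
  }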
 // Atomic compare and swap.
-let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
-    isCodeGenOnly = 1 in
-def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
-               "lock\n\t"
-               "cmpxchg8b\t$ptr",
-               [(X86cas8 addr:$ptr)], IIC_CMPX_LOCK_8B>, TB, LOCK;
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+                         SDPatternOperator frag, X86MemOperand x86memop,
+                         InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+  def #NAME# : I<Opc, Form, (outs), (ins x86memop:$ptr),
+                 !strconcat(mnemonic, "\t$ptr"),
+                 [(frag addr:$ptr)], itin>, TB, LOCK;
+}
+}
+
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          string mnemonic, SDPatternOperator frag,
+                          InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+  let Defs = [AL, EFLAGS], Uses = [AL] in
+  def #NAME#8  : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+                   !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+                   [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+  let Defs = [AX, EFLAGS], Uses = [AX] in
+  def #NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+                   !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+                   [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK;
+  let Defs = [EAX, EFLAGS], Uses = [EAX] in
+  def #NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+                   !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+                   [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK;
+  let Defs = [RAX, EFLAGS], Uses = [RAX] in
+  def #NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+                    !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+                    [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+                                X86cas8, i64mem,
+                                IIC_CMPX_LOCK_8B>;
+}
 
 let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
-    isCodeGenOnly = 1 in
-def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
-                    "lock\n\t"
-                    "cmpxchg16b\t$ptr",
-                    [(X86cas16 addr:$ptr)], IIC_CMPX_LOCK_16B>, TB, LOCK,
-                    Requires<[HasCmpxchg16b]>;
-
-let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in {
-def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
-               "lock\n\t"
-               "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}",
-               [(X86cas addr:$ptr, GR8:$swap, 1)], IIC_CMPX_LOCK_8>, TB, LOCK;
-}
-
-let Defs = [AX, EFLAGS], Uses = [AX], isCodeGenOnly = 1 in {
-def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap),
-               "lock\n\t"
-               "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}",
-               [(X86cas addr:$ptr, GR16:$swap, 2)], IIC_CMPX_LOCK>, TB, OpSize, LOCK;
-}
-
-let Defs = [EAX, EFLAGS], Uses = [EAX], isCodeGenOnly = 1 in {
-def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
-               "lock\n\t"
-               "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}",
-               [(X86cas addr:$ptr, GR32:$swap, 4)], IIC_CMPX_LOCK>, TB, LOCK;
-}
-
-let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in {
-def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
-               "lock\n\t"
-               "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}",
-               [(X86cas addr:$ptr, GR64:$swap, 8)], IIC_CMPX_LOCK>, TB, LOCK;
+    Predicates = [HasCmpxchg16b] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+                                 X86cas16, i128mem,
+                                 IIC_CMPX_LOCK_16B>, REX_W;
 }
 
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+                               X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
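For context on the cmpxchg patterns just defined, a hedged C sketch
(function name invented):

  #include <stdatomic.h>

  /* Hypothetical illustration: a strong compare-exchange is selected
     through the X86cas patterns above, emitting
     "lock cmpxchg %edx, (%rdi)" with the expected value preloaded into
     %eax and ZF reporting whether the swap happened. */
  _Bool cas(atomic_int *p, int expected, int desired) {
      return atomic_compare_exchange_strong(p, &expected, desired);
  }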
 // Atomic exchange and add
-let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in {
-def LXADD8  : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
-               "lock\n\t"
-               "xadd{b}\t{$val, $ptr|$ptr, $val}",
-               [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))],
-                IIC_XADD_LOCK_MEM8>,
-                TB, LOCK;
-def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr),
-               "lock\n\t"
-               "xadd{w}\t{$val, $ptr|$ptr, $val}",
-               [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))],
-                IIC_XADD_LOCK_MEM>,
-                TB, OpSize, LOCK;
-def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr),
-               "lock\n\t"
-               "xadd{l}\t{$val, $ptr|$ptr, $val}",
-               [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))],
-                IIC_XADD_LOCK_MEM>,
-                TB, LOCK;
-def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr),
-               "lock\n\t"
-               "xadd{q}\t{$val, $ptr|$ptr, $val}",
-               [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))],
-                IIC_XADD_LOCK_MEM>,
-                TB, LOCK;
-}
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+                             string frag,
+                             InstrItinClass itin8, InstrItinClass itin> {
+  let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in {
+    def #NAME#8  : I<opc8, MRMSrcMem, (outs GR8:$dst),
+                     (ins GR8:$val, i8mem:$ptr),
+                     !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+                     [(set GR8:$dst,
+                           (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+                     itin8>;
+    def #NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+                     (ins GR16:$val, i16mem:$ptr),
+                     !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+                     [(set
+                        GR16:$dst,
+                        (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+                     itin>, OpSize;
+    def #NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+                     (ins GR32:$val, i32mem:$ptr),
+                     !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+                     [(set
+                        GR32:$dst,
+                        (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+                     itin>;
+    def #NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+                      (ins GR64:$val, i64mem:$ptr),
+                      !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR64:$dst,
+                         (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+                      itin>;
+  }
+}
+
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+                               IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+             TB, LOCK;
 
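And the counterpart where the old value is used, exercising the LXADD
patterns (sketch, invented name):

  #include <stdatomic.h>

  /* Hypothetical illustration: the previous value is needed, so this is
     selected through the atomic_load_add_32 pattern above, emitting
     "lock xadd %esi, (%rdi)" with the old value returned in the
     register operand. */
  int fetch_add(atomic_int *counter, int delta) {
      return atomic_fetch_add(counter, delta);
  }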
 def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
                       "#ACQUIRE_MOV PSEUDO!",
@@ -1017,7 +997,24 @@
 def : Pat<(X86call (i64 texternalsym:$dst)),
           (CALL64pcrel32 texternalsym:$dst)>;
 
-// tailcall stuff
+// Tail calls. The TCRETURN instructions execute after the epilogue, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that the calling convention never uses is %r11;
+// every other volatile register may be taken, e.g. by a vararg call with six
+// arguments (%al carries the vector count).
+//
+// Match an X86tcret that uses fewer than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+                             (X86tcret node:$ptr, node:$off), [{
+  // X86tcret args: (*chain, ptr, imm, regs..., glue)
+  unsigned NumRegs = 0;
+  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+      return false;
+  return true;
+}]>;
+
 def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
           (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
           Requires<[In32BitMode]>;
@@ -1041,7 +1038,9 @@
           (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
           Requires<[In64BitMode]>;
 
-def : Pat<(X86tcret (load addr:$dst), imm:$off),
+// Don't fold loads into an X86tcret that requires more than 6 registers:
+// there wouldn't be enough scratch registers left for a base+index address.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
           (TCRETURNmi64 addr:$dst, imm:$off)>,
           Requires<[In64BitMode]>;
 

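To make the X86tcret_6regs predicate above concrete, a hedged C sketch
(names invented; assumes the SysV x86-64 convention):

  /* Hypothetical illustration: with six integer arguments, %rdi, %rsi,
     %rdx, %rcx, %r8 and %r9 are all occupied (and %al carries the
     vector count for a vararg callee), leaving %r11 as the only unused
     volatile register.  A simple pointer load can be folded into
     TCRETURNmi64, but an indexed table entry would need a base+index
     address, i.e. two scratch registers, which is what the 6-register
     limit rules out. */
  int (*callee)(int, int, int, int, int, int);

  int caller(int a, int b, int c, int d, int e, int f) {
      return callee(a, b, c, d, e, f);   /* tail call through a load */
  }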
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrControl.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrControl.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrControl.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrControl.td Tue Jan 15 11:16:16 2013
@@ -16,15 +16,18 @@
 //
 
 // Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
 let isTerminator = 1, isReturn = 1, isBarrier = 1,
     hasCtrlDep = 1, FPForm = SpecialFP in {
-  def RET    : I   <0xC3, RawFrm, (outs), (ins),
+  def RET    : I   <0xC3, RawFrm, (outs), (ins variable_ops),
                     "ret",
                     [(X86retflag 0)], IIC_RET>;
   def RETW   : I   <0xC3, RawFrm, (outs), (ins),
                     "ret{w}",
                     [], IIC_RET>, OpSize;
-  def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+  def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
                     "ret\t$amt",
                     [(X86retflag timm:$amt)], IIC_RET_IMM>;
   def RETIW  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),

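A short note on the variadic RET change above, with a hedged C sketch
(32-bit x86 assumed, function name invented):

  /* Hypothetical illustration: on x86-32 a 'long double' result is
     returned in ST0 (ST1 can also be live, e.g. for some complex
     returns), and modeling RET as variadic lets those register uses be
     attached to the return instruction as extra operands. */
  long double twice(long double x) {
      return x * 2.0L;   /* value is left in ST0 at the ret */
  }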
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFMA.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFMA.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFMA.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFMA.td Tue Jan 15 11:16:16 2013
@@ -16,243 +16,180 @@
 //===----------------------------------------------------------------------===//
 
 let Constraints = "$src1 = $dst" in {
-multiclass fma3p_rm<bits<8> opc, string OpcodeStr> {
-let neverHasSideEffects = 1 in {
-  def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
-               (ins VR128:$src1, VR128:$src2, VR128:$src3),
-               !strconcat(OpcodeStr, 
-                          "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
-  let mayLoad = 1 in
-  def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
-               (ins VR128:$src1, VR128:$src2, f128mem:$src3),
-               !strconcat(OpcodeStr, 
-                          "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
-  def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
-                (ins VR256:$src1, VR256:$src2, VR256:$src3),
-                !strconcat(OpcodeStr, 
-                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
-  let mayLoad = 1 in
-  def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
-                (ins VR256:$src1, VR256:$src2, f256mem:$src3),
-                !strconcat(OpcodeStr, 
-                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
-} // neverHasSideEffects = 1
-}
-
-// Intrinsic for 213 pattern
-multiclass fma3p_rm_int<bits<8> opc, string OpcodeStr,
-                        PatFrag MemFrag128, PatFrag MemFrag256,
-                        Intrinsic Int128, Intrinsic Int256, SDNode Op213, 
-			                  ValueType OpVT128, ValueType OpVT256> {
-  def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
-                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
-                   !strconcat(OpcodeStr,
-                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                   [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1,
-                                      VR128:$src3))]>;
-
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
+                    PatFrag MemFrag128, PatFrag MemFrag256,
+                    ValueType OpVT128, ValueType OpVT256,
+                    SDPatternOperator Op = null_frag> {
+  let isCommutable = 1 in
   def r     : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, VR128:$src3),
                    !strconcat(OpcodeStr,
                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                   [(set VR128:$dst, (OpVT128 (Op213 VR128:$src2, 
+                   [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
                                                VR128:$src1, VR128:$src3)))]>;
 
-  def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
-                   (ins VR128:$src1, VR128:$src2, f128mem:$src3),
-                   !strconcat(OpcodeStr,
-                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                   [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1,
-                                      (MemFrag128 addr:$src3)))]>;
-
+  let mayLoad = 1 in
   def m     : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, f128mem:$src3),
-                   !strconcat(OpcodeStr, 
+                   !strconcat(OpcodeStr,
                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                   [(set VR128:$dst, (OpVT128 (Op213 VR128:$src2, VR128:$src1,
+                   [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
                                                (MemFrag128 addr:$src3))))]>;
 
-
-  def rY_Int : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
-                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
-                    !strconcat(OpcodeStr, 
-                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                    [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1, 
-                                       VR256:$src3))]>;
-
+  let isCommutable = 1 in
   def rY    : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
-                   !strconcat(OpcodeStr, 
+                   !strconcat(OpcodeStr,
                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                   [(set VR256:$dst, (OpVT256 (Op213 VR256:$src2, VR256:$src1,
-                                               VR256:$src3)))]>;
-
-  def mY_Int : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
-                    (ins VR256:$src1, VR256:$src2, f256mem:$src3),
-                    !strconcat(OpcodeStr, 
-                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                    [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1, 
-                                       (MemFrag256 addr:$src3)))]>;
+                   [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
+                                               VR256:$src3)))]>, VEX_L;
 
+  let mayLoad = 1 in
   def mY    : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, f256mem:$src3),
-                   !strconcat(OpcodeStr, 
+                   !strconcat(OpcodeStr,
                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR256:$dst,
-                     (OpVT256 (Op213 VR256:$src2, VR256:$src1, 
-                               (MemFrag256 addr:$src3))))]>;
+                     (OpVT256 (Op VR256:$src2, VR256:$src1,
+                               (MemFrag256 addr:$src3))))]>, VEX_L;
 }
 } // Constraints = "$src1 = $dst"
 
 multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        string OpcodeStr, string PackTy,
                        PatFrag MemFrag128, PatFrag MemFrag256,
-                       Intrinsic Int128, Intrinsic Int256, SDNode Op,
-                       ValueType OpTy128, ValueType OpTy256> {
-  defm r213 : fma3p_rm_int <opc213, !strconcat(OpcodeStr,
-                            !strconcat("213", PackTy)), MemFrag128, MemFrag256,
-                            Int128, Int256, Op, OpTy128, OpTy256>;
-  defm r132 : fma3p_rm <opc132, 
-                        !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
-  defm r231 : fma3p_rm <opc231, 
-                        !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
+                       SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+  defm r213 : fma3p_rm<opc213,
+                       !strconcat(OpcodeStr, !strconcat("213", PackTy)),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+let neverHasSideEffects = 1 in {
+  defm r132 : fma3p_rm<opc132,
+                       !strconcat(OpcodeStr, !strconcat("132", PackTy)),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256>;
+  defm r231 : fma3p_rm<opc231,
+                       !strconcat(OpcodeStr, !strconcat("231", PackTy)),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256>;
+} // neverHasSideEffects = 1
 }
 
 // Fused Multiply-Add
 let ExeDomain = SSEPackedSingle in {
   defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
-                                 memopv8f32, int_x86_fma_vfmadd_ps,
-                                 int_x86_fma_vfmadd_ps_256, X86Fmadd,
-                                 v4f32, v8f32>;
+                                 memopv8f32, X86Fmadd, v4f32, v8f32>;
   defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
-                                 memopv8f32, int_x86_fma_vfmsub_ps, 
-                                 int_x86_fma_vfmsub_ps_256, X86Fmsub,
-                                 v4f32, v8f32>;
+                                 memopv8f32, X86Fmsub, v4f32, v8f32>;
   defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
-                                 memopv4f32, memopv8f32, 
-                                 int_x86_fma_vfmaddsub_ps,
-                                 int_x86_fma_vfmaddsub_ps_256, X86Fmaddsub, 
+                                 memopv4f32, memopv8f32, X86Fmaddsub,
                                  v4f32, v8f32>;
   defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
-                                 memopv4f32, memopv8f32, 
-                                 int_x86_fma_vfmsubadd_ps,
-                                 int_x86_fma_vfmaddsub_ps_256, X86Fmsubadd,
+                                 memopv4f32, memopv8f32, X86Fmsubadd,
                                  v4f32, v8f32>;
 }
 
 let ExeDomain = SSEPackedDouble in {
   defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
-                                 memopv4f64, int_x86_fma_vfmadd_pd,
-                                 int_x86_fma_vfmadd_pd_256, X86Fmadd, v2f64,
-                                 v4f64>, VEX_W;
+                                 memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
   defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
-                                 memopv4f64, int_x86_fma_vfmsub_pd,
-                                 int_x86_fma_vfmsub_pd_256, X86Fmsub, v2f64,
-                                 v4f64>, VEX_W;
+                                 memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
   defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
-                                 memopv2f64, memopv4f64,
-                                 int_x86_fma_vfmaddsub_pd,
-                                 int_x86_fma_vfmaddsub_pd_256, X86Fmaddsub,
+                                 memopv2f64, memopv4f64, X86Fmaddsub,
                                  v2f64, v4f64>, VEX_W;
   defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
-                                 memopv2f64, memopv4f64, 
-                                 int_x86_fma_vfmsubadd_pd,
-                                 int_x86_fma_vfmsubadd_pd_256, X86Fmsubadd,
+                                 memopv2f64, memopv4f64, X86Fmsubadd,
                                  v2f64, v4f64>, VEX_W;
 }
 
 // Fused Negative Multiply-Add
 let ExeDomain = SSEPackedSingle in {
   defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps",  memopv4f32,
-                               memopv8f32, int_x86_fma_vfnmadd_ps,
-                               int_x86_fma_vfnmadd_ps_256, X86Fnmadd, v4f32,
-                               v8f32>;
+                               memopv8f32, X86Fnmadd, v4f32, v8f32>;
   defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps",  memopv4f32,
-                               memopv8f32, int_x86_fma_vfnmsub_ps,
-                               int_x86_fma_vfnmsub_ps_256, X86Fnmsub, v4f32,
-                               v8f32>;
+                               memopv8f32, X86Fnmsub, v4f32, v8f32>;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
-                               memopv4f64, int_x86_fma_vfnmadd_pd,
-                               int_x86_fma_vfnmadd_pd_256, X86Fnmadd, v2f64,
-                               v4f64>, VEX_W;
+                               memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
   defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
-                               memopv2f64,
-                               memopv4f64, int_x86_fma_vfnmsub_pd,
-                               int_x86_fma_vfnmsub_pd_256, X86Fnmsub, v2f64,
+                               memopv2f64, memopv4f64, X86Fnmsub, v2f64,
                                v4f64>, VEX_W;
 }
 
 let Constraints = "$src1 = $dst" in {
 multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
-                    RegisterClass RC> {
-let neverHasSideEffects = 1 in {
-  def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
-               (ins RC:$src1, RC:$src2, RC:$src3),
-               !strconcat(OpcodeStr, 
-                          "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
+                    RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
+                    SDPatternOperator OpNode = null_frag> {
+  let isCommutable = 1 in
+  def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
   let mayLoad = 1 in
-  def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
-               (ins RC:$src1, RC:$src2, x86memop:$src3),
-               !strconcat(OpcodeStr, 
-                          "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
-} // neverHasSideEffects = 1
+  def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1,
+                            (mem_frag addr:$src3))))]>;
 }
 
 multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
-                        ComplexPattern mem_cpat, Intrinsic IntId, 
-                        RegisterClass RC, SDNode OpNode, ValueType OpVT> {
+                        ComplexPattern mem_cpat, Intrinsic IntId,
+                        RegisterClass RC> {
+  let isCommutable = 1 in
   def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, VR128:$src3),
-                   !strconcat(OpcodeStr, 
-                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 
-                   [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1, 
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1,
                      VR128:$src3))]>;
   def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, memop:$src3),
-                   !strconcat(OpcodeStr, 
+                   !strconcat(OpcodeStr,
                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>;
-  def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
-                   (ins RC:$src1, RC:$src2, RC:$src3),
-                   !strconcat(OpcodeStr, 
-                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                   [(set RC:$dst,
-		                 (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
-  let mayLoad = 1 in
-  def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
-                   (ins RC:$src1, RC:$src2, memop:$src3),
-                   !strconcat(OpcodeStr, 
-                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
 }
 } // Constraints = "$src1 = $dst"
 
 multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
-                       string OpStr, Intrinsic IntF32, Intrinsic IntF64, 
-                       SDNode OpNode> {
-  defm SSr132 : fma3s_rm<opc132, !strconcat(OpStr, "132ss"), f32mem, FR32>;
-  defm SSr231 : fma3s_rm<opc231, !strconcat(OpStr, "231ss"), f32mem, FR32>;
-  defm SDr132 : fma3s_rm<opc132, !strconcat(OpStr, "132sd"), f64mem, FR64>,
-                        VEX_W;
-  defm SDr231 : fma3s_rm<opc231, !strconcat(OpStr, "231sd"), f64mem, FR64>,
-                        VEX_W;
-  defm SSr213 : fma3s_rm_int <opc213, !strconcat(OpStr, "213ss"), ssmem,
-                              sse_load_f32, IntF32, FR32, OpNode, f32>;
-  defm SDr213 : fma3s_rm_int <opc213, !strconcat(OpStr, "213sd"), sdmem,
-                              sse_load_f64, IntF64, FR64, OpNode, f64>, VEX_W;
-}
-
-defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
-                          int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
-defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
-                          int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
-
-defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
-                           int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
-defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
-                           int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+                       string OpStr, string PackTy, Intrinsic Int,
+                       SDNode OpNode, RegisterClass RC, ValueType OpVT,
+                       X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
+                       ComplexPattern mem_cpat> {
+let neverHasSideEffects = 1 in {
+  defm r132 : fma3s_rm<opc132, !strconcat(OpStr, !strconcat("132", PackTy)),
+                       x86memop, RC, OpVT, mem_frag>;
+  defm r231 : fma3s_rm<opc231, !strconcat(OpStr, !strconcat("231", PackTy)),
+                       x86memop, RC, OpVT, mem_frag>;
+}
+
+defm r213 : fma3s_rm<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
+                     x86memop, RC, OpVT, mem_frag, OpNode>,
+            fma3s_rm_int<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
+                         memop, mem_cpat, Int, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                 string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+                 SDNode OpNode> {
+  defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", IntF32, OpNode,
+                        FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
+  defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", IntF64, OpNode,
+                        FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+                    int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+                    int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+                     int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+                     int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
 
 
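Before the FMA4 section, a brief aside on the three-form FMA3 encoding that
fma3p_forms/fma3s instantiate (informal; the C function is invented):

  /* Hypothetical illustration of the FMA3 operand orderings.  Each form
     overwrites the destination, which is also a source:
       vfmadd132ss: dst = dst  * src3 + src2
       vfmadd213ss: dst = src2 * dst  + src3
       vfmadd231ss: dst = src2 * src3 + dst
     The 213 form matches the generic (a * b) + c DAG pattern with the
     accumulator tied to dst, which is why only the r213 defs above
     carry an SDNode pattern. */
  float fma_example(float a, float b, float c) {
      return a * b + c;   /* may select vfmadd213ss with FMA enabled */
  }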
 //===----------------------------------------------------------------------===//
@@ -260,73 +197,102 @@
 //===----------------------------------------------------------------------===//
 
 
-multiclass fma4s<bits<8> opc, string OpcodeStr, Operand memop,
-                 ComplexPattern mem_cpat, Intrinsic Int> {
-  def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                 X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+                 PatFrag mem_frag> {
+  let isCommutable = 1 in
+  def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, RC:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst,
-             (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
-  def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, memop:$src3),
+           [(set RC:$dst,
+             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+  def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, x86memop:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst,
-             (Int VR128:$src1, VR128:$src2, mem_cpat:$src3))]>, VEX_W, MemOp4;
-  def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, memop:$src2, VR128:$src3),
+           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+                           (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+  def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, x86memop:$src2, RC:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst,
-             (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+           [(set RC:$dst,
+             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
 // For disassembler
 let isCodeGenOnly = 1 in
-  def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
-               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+  def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2, RC:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
 }
 
-multiclass fma4p<bits<8> opc, string OpcodeStr,
-                 Intrinsic Int128, Intrinsic Int256,
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+                     ComplexPattern mem_cpat, Intrinsic Int> {
+  let isCommutable = 1 in
+  def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+  def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, memop:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+                                  mem_cpat:$src3))]>, VEX_W, MemOp4;
+  def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, memop:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                 ValueType OpVT128, ValueType OpVT256,
                  PatFrag ld_frag128, PatFrag ld_frag256> {
+  let isCommutable = 1 in
   def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, VR128:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-             (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+             (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+           VEX_W, MemOp4;
   def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, f128mem:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2,
+           [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
                               (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
   def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, f128mem:$src2, VR128:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-             (Int128 VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+             (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+  let isCommutable = 1 in
   def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
            (ins VR256:$src1, VR256:$src2, VR256:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR256:$dst,
-             (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_W, MemOp4;
+             (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+           VEX_W, MemOp4, VEX_L;
   def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, VR256:$src2, f256mem:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2,
-                              (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4;
+           [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+                              (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L;
   def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, f256mem:$src2, VR256:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR256:$dst,
-             (Int256 VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>;
+           [(set VR256:$dst, (OpNode VR256:$src1,
+                              (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
 // For disassembler
 let isCodeGenOnly = 1 in {
   def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
@@ -336,51 +302,65 @@
   def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
                 (ins VR256:$src1, VR256:$src2, VR256:$src3),
                 !strconcat(OpcodeStr,
-                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+                VEX_L;
 } // isCodeGenOnly = 1
 }
 
 let Predicates = [HasFMA4] in {
 
-defm VFMADDSS4    : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32,
-                          int_x86_fma_vfmadd_ss>;
-defm VFMADDSD4    : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64,
-                          int_x86_fma_vfmadd_sd>;
-defm VFMADDPS4    : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps,
-                          int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd,
-                          int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBSS4    : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32,
-                          int_x86_fma_vfmsub_ss>;
-defm VFMSUBSD4    : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64,
-                          int_x86_fma_vfmsub_sd>;
-defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps,
-                          int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd,
-                          int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFNMADDSS4   : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32,
-                          int_x86_fma_vfnmadd_ss>;
-defm VFNMADDSD4   : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
-                          int_x86_fma_vfnmadd_sd>;
-defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps,
-                          int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd,
-                          int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>;
-defm VFNMSUBSS4   : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32,
-                          int_x86_fma_vfnmsub_ss>;
-defm VFNMSUBSD4   : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
-                          int_x86_fma_vfnmsub_sd>;
-defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps,
-                          int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd,
-                          int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps,
-                          int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd,
-                          int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps,
-                          int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd,
-                          int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
+defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+                  fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+                            int_x86_fma_vfmadd_ss>;
+defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+                  fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfmadd_sd>;
+defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+                  fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+                            int_x86_fma_vfmsub_ss>;
+defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+                  fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfmsub_sd>;
+defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+                        X86Fnmadd, loadf32>,
+                  fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+                            int_x86_fma_vfnmadd_ss>;
+defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+                        X86Fnmadd, loadf64>,
+                  fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfnmadd_sd>;
+defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+                        X86Fnmsub, loadf32>,
+                  fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+                            int_x86_fma_vfnmsub_ss>;
+defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+                        X86Fnmsub, loadf64>,
+                  fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfnmsub_sd>;
+
+defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
+defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+                          memopv4f32, memopv8f32>;
+defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+                          memopv2f64, memopv4f64>;
 } // HasFMA4
 
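A matching aside for FMA4 (hedged; flags and names are illustrative): FMA4
instructions encode a separate destination, so no source is clobbered and the
multiclasses above need no "$src1 = $dst" constraint.

  /* Hypothetical illustration: in AT&T syntax,
       vfmaddss %xmm3, %xmm2, %xmm1, %xmm0
     computes xmm0 = xmm1 * xmm2 + xmm3, leaving all three sources
     intact. */
  float fma4_example(float a, float b, float c) {
      float r = a * b + c;   /* may select vfmaddss with -mfma4 */
      return r + a;          /* 'a' stays live; FMA4 need not clobber it */
  }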

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFormats.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFormats.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFormats.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFormats.td Tue Jan 15 11:16:16 2013
@@ -44,14 +44,15 @@
 def MRM_D0 : Format<45>;
 def MRM_D1 : Format<46>;
 def MRM_D4 : Format<47>;
-def MRM_D8 : Format<48>;
-def MRM_D9 : Format<49>;
-def MRM_DA : Format<50>;
-def MRM_DB : Format<51>;
-def MRM_DC : Format<52>;
-def MRM_DD : Format<53>;
-def MRM_DE : Format<54>;
-def MRM_DF : Format<55>;
+def MRM_D5 : Format<48>;
+def MRM_D8 : Format<49>;
+def MRM_D9 : Format<50>;
+def MRM_DA : Format<51>;
+def MRM_DB : Format<52>;
+def MRM_DC : Format<53>;
+def MRM_DD : Format<54>;
+def MRM_DE : Format<55>;
+def MRM_DF : Format<56>;
 
 // ImmType - This specifies the immediate type used by an instruction. This is
 // part of the ad-hoc solution used to emit machine instruction encodings by our
@@ -287,12 +288,14 @@
   let CodeSize = 3;
 }
 
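+// Dummy def so the XS prefix value can be referred to symbolically below,
+// instead of hard-coding the magic constant 12.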
+def __xs : XS;
+
 // SI - SSE 1 & 2 scalar instructions
 class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
          list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin> {
   let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-            !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
+            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -303,7 +306,7 @@
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin> {
   let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-            !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
+            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -314,18 +317,25 @@
          InstrItinClass itin, Domain d>
       : I<o, F, outs, ins, asm, pattern, itin, d> {
   let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-        !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
+        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
 }
 
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin, Domain d>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]);
+}
+
 // PIi8 - SSE 1 & 2 packed instructions with immediate
 class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin, Domain d>
       : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
   let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
-        !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
+        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
@@ -341,18 +351,18 @@
 
 class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
 class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
 class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
-        Requires<[HasSSE1]>;
+        Requires<[UseSSE1]>;
 class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
-        Requires<[HasSSE1]>;
+        Requires<[UseSSE1]>;
 class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
@@ -372,27 +382,31 @@
 //   PDIi8  - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
 //   VSDI   - SSE2 instructions with XD prefix in AVX form.
 //   VPDI   - SSE2 instructions with TB and OpSize prefixes in AVX form.
+//   MMXSDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+//               MMX operands.
+//   MMXS2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+//               MMX operands.
 
 class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+      : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
 class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
 class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
 class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
 class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
-        Requires<[HasSSE2]>;
+        Requires<[UseSSE2]>;
 class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
-        Requires<[HasSSE2]>;
+        Requires<[UseSSE2]>;
 class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
@@ -405,6 +419,12 @@
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
         OpSize, Requires<[HasAVX]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
 
 // SSE3 Instruction Templates:
 // 
@@ -415,21 +435,23 @@
 class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, 
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
-        Requires<[HasSSE3]>;
+        Requires<[UseSSE3]>;
 class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, 
            list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
-        Requires<[HasSSE3]>;
+        Requires<[UseSSE3]>;
 class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
-        Requires<[HasSSE3]>;
+        Requires<[UseSSE3]>;
 
 
 // SSSE3 Instruction Templates:
 // 
 //   SS38I - SSSE3 instructions with T8 prefix.
 //   SS3AI - SSSE3 instructions with TA prefix.
+//   MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+//   MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
 //
 // Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
 // uses the MMX registers. The 64-bit versions are grouped with the MMX
@@ -438,10 +460,18 @@
 class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
-        Requires<[HasSSSE3]>;
+        Requires<[UseSSSE3]>;
 class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
         Requires<[HasSSSE3]>;
 
 // SSE4.1 Instruction Templates:
@@ -452,11 +482,11 @@
 class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
-        Requires<[HasSSE41]>;
+        Requires<[UseSSE41]>;
 class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
-        Requires<[HasSSE41]>;
+        Requires<[UseSSE41]>;
 
 // SSE4.2 Instruction Templates:
 // 
@@ -464,9 +494,10 @@
 class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
-        Requires<[HasSSE42]>;
+        Requires<[UseSSE42]>;
 
 //   SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: SS42FI keeps 'HasSSE42' because it is only used for CRC32 insns.
 class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
@@ -475,7 +506,7 @@
 class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
-        Requires<[HasSSE42]>;
+        Requires<[UseSSE42]>;
 
 // AVX Instruction Templates:
 //   Instructions introduced in AVX (no SSE equivalent forms)

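A note for readers skimming the Has*/Use* predicate change running through these templates: the Use* predicates require the SSE feature and the absence of AVX, so the legacy-encoded SSE patterns stop matching on AVX-capable subtargets and the VEX-encoded patterns (guarded by HasAVX) win instead; the new MMX* templates keep the plain Has* predicates because MMX operations have no VEX forms. A minimal stand-alone C++ model of the two predicates (illustrative only, not part of the patch; Subtarget below is a stand-in for X86Subtarget):

    #include <cstdio>

    // Stand-in for X86Subtarget; the field names are illustrative.
    struct Subtarget { bool SSE2, AVX; };

    // HasSSE2 admits a pattern whenever SSE2 exists; UseSSE2 additionally
    // requires AVX to be off, so on AVX-capable targets the legacy SSE
    // encodings lose to the VEX-encoded patterns guarded by HasAVX.
    static bool HasSSE2(const Subtarget &ST) { return ST.SSE2; }
    static bool UseSSE2(const Subtarget &ST) { return ST.SSE2 && !ST.AVX; }

    int main() {
      Subtarget AVXCore{true, true}, SSE2Core{true, false};
      std::printf("AVX core:  HasSSE2=%d UseSSE2=%d\n",
                  HasSSE2(AVXCore), UseSSE2(AVXCore));   // 1 0
      std::printf("SSE2 core: HasSSE2=%d UseSSE2=%d\n",
                  HasSSE2(SSE2Core), UseSSE2(SSE2Core)); // 1 1
      return 0;
    }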
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFragmentsSIMD.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFragmentsSIMD.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFragmentsSIMD.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrFragmentsSIMD.td Tue Jan 15 11:16:16 2013
@@ -29,6 +29,13 @@
 
 def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
 def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc    : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc    : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+
 def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
                         [SDNPCommutative, SDNPAssociative]>;
 def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
@@ -73,14 +80,31 @@
                  SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
 
 def X86vzmovly  : SDNode<"X86ISD::VZEXT_MOVL",
-                 SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, 
+                 SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                       SDTCisOpSmallerThanOp<1, 0> ]>>;
 
 def X86vsmovl  : SDNode<"X86ISD::VSEXT_MOVL",
-                 SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
+                 SDTypeProfile<1, 1,
+                 [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
 
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86vzext   : SDNode<"X86ISD::VZEXT",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>]>>;
+
+def X86vsext   : SDNode<"X86ISD::VSEXT",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>]>>;
+
+def X86vfpext  : SDNode<"X86ISD::VFPEXT",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisFP<0>, SDTCisFP<1>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisFP<0>, SDTCisFP<1>]>>;
+
 def X86vshldq  : SDNode<"X86ISD::VSHLDQ",    SDTIntShiftOp>;
 def X86vshrdq  : SDNode<"X86ISD::VSRLDQ",    SDTIntShiftOp>;
 def X86cmpp    : SDNode<"X86ISD::CMPP",      SDTX86VFCMP>;
@@ -170,8 +194,8 @@
 def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
 def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
 def X86Fnmsub    : SDNode<"X86ISD::FNMSUB",    SDTFma>;
-def X86Fmaddsub  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
-def X86Fmsubadd  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
+def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
+def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
 
 def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                          SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -227,6 +251,10 @@
 def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
 def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
+// 128-/256-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{

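The X86Fmaddsub/X86Fmsubadd hunk above fixes cross-wired ISD node names, which is easy to misread in diff form. A scalar model of the intended lane semantics (a sketch only, not code from the patch):

    #include <cstdio>

    // FMADDSUB subtracts c in even lanes and adds it in odd lanes;
    // FMSUBADD is the mirror image. The old defs had the two ISD names
    // swapped, which this hunk corrects.
    static void fmaddsub(const float *a, const float *b, const float *c,
                         float *r, int n) {
      for (int i = 0; i < n; ++i)
        r[i] = (i & 1) ? a[i] * b[i] + c[i]   // odd lane: add
                       : a[i] * b[i] - c[i];  // even lane: subtract
    }

    static void fmsubadd(const float *a, const float *b, const float *c,
                         float *r, int n) {
      for (int i = 0; i < n; ++i)
        r[i] = (i & 1) ? a[i] * b[i] - c[i]   // odd lane: subtract
                       : a[i] * b[i] + c[i];  // even lane: add
    }

    int main() {
      const float a[4] = {1, 1, 1, 1}, b[4] = {2, 2, 2, 2},
                  c[4] = {1, 1, 1, 1};
      float r[4];
      fmaddsub(a, b, c, r, 4);
      std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 1 3 1 3
      fmsubadd(a, b, c, r, 4);
      std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 3 1 3 1
      return 0;
    }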
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.cpp Tue Jan 15 11:16:16 2013
@@ -414,12 +414,6 @@
     { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          0 },
     { X86::CVTSS2SI64rr,    X86::CVTSS2SI64rm,        0 },
     { X86::CVTSS2SIrr,      X86::CVTSS2SIrm,          0 },
-    { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm,      0 },
-    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm,    0 },
-    { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm,      0 },
-    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm,    0 },
-    { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm,      0 },
-    { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm,      0 },
     { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
     { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
     { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  0 },
@@ -567,6 +561,16 @@
     { X86::VSQRTPSYr_Int,   X86::VSQRTPSYm_Int,       TB_ALIGN_32 },
     { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
     { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
+
+    // BMI/BMI2 foldable instructions
+    { X86::RORX32ri,        X86::RORX32mi,            0 },
+    { X86::RORX64ri,        X86::RORX64mi,            0 },
+    { X86::SARX32rr,        X86::SARX32rm,            0 },
+    { X86::SARX64rr,        X86::SARX64rm,            0 },
+    { X86::SHRX32rr,        X86::SHRX32rm,            0 },
+    { X86::SHRX64rr,        X86::SHRX64rm,            0 },
+    { X86::SHLX32rr,        X86::SHLX32rm,            0 },
+    { X86::SHLX64rr,        X86::SHLX64rm,            0 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -680,6 +684,12 @@
     { X86::IMUL64rr,        X86::IMUL64rm,      0 },
     { X86::Int_CMPSDrr,     X86::Int_CMPSDrm,   0 },
     { X86::Int_CMPSSrr,     X86::Int_CMPSSrm,   0 },
+    { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm,      0 },
+    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm,    0 },
+    { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm,      0 },
+    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm,    0 },
+    { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm,      0 },
+    { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm,      0 },
     { X86::MAXPDrr,         X86::MAXPDrm,       TB_ALIGN_16 },
     { X86::MAXPDrr_Int,     X86::MAXPDrm_Int,   TB_ALIGN_16 },
     { X86::MAXPSrr,         X86::MAXPSrm,       TB_ALIGN_16 },
@@ -1110,6 +1120,44 @@
     { X86::VPUNPCKLWDYrr,     X86::VPUNPCKLWDYrm,      TB_ALIGN_32 },
     { X86::VPXORYrr,          X86::VPXORYrm,           TB_ALIGN_32 },
     // FIXME: add AVX 256-bit foldable instructions
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        0           },
+    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        0           },
+    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       TB_ALIGN_32 },
+    { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       TB_ALIGN_32 },
+    { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       0           },
+    { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       0           },
+    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_16 },
+    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_16 },
+    { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      TB_ALIGN_32 },
+    { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      TB_ALIGN_32 },
+    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        0           },
+    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        0           },
+    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       TB_ALIGN_32 },
+    { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       TB_ALIGN_32 },
+    { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       0           },
+    { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       0           },
+    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_16 },
+    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_16 },
+    { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      TB_ALIGN_32 },
+    { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      TB_ALIGN_32 },
+    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_16 },
+    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_16 },
+    { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    TB_ALIGN_32 },
+    { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    TB_ALIGN_32 },
+    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_16 },
+    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_16 },
+    { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    TB_ALIGN_32 },
+    { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    TB_ALIGN_32 },
+
+    // BMI/BMI2 foldable instructions
+    { X86::MULX32rr,          X86::MULX32rm,            0 },
+    { X86::MULX64rr,          X86::MULX64rm,            0 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1145,10 +1193,6 @@
     { X86::VFMADDPDr132rY,        X86::VFMADDPDr132mY,        TB_ALIGN_32 },
     { X86::VFMADDPSr213rY,        X86::VFMADDPSr213mY,        TB_ALIGN_32 },
     { X86::VFMADDPDr213rY,        X86::VFMADDPDr213mY,        TB_ALIGN_32 },
-    { X86::VFMADDPSr213r_Int,     X86::VFMADDPSr213m_Int,     TB_ALIGN_16 },
-    { X86::VFMADDPDr213r_Int,     X86::VFMADDPDr213m_Int,     TB_ALIGN_16 },
-    { X86::VFMADDPSr213rY_Int,    X86::VFMADDPSr213mY_Int,    TB_ALIGN_32 },
-    { X86::VFMADDPDr213rY_Int,    X86::VFMADDPDr213mY_Int,    TB_ALIGN_32 },
 
     { X86::VFNMADDSSr231r,        X86::VFNMADDSSr231m,        0 },
     { X86::VFNMADDSDr231r,        X86::VFNMADDSDr231m,        0 },
@@ -1171,10 +1215,6 @@
     { X86::VFNMADDPDr132rY,       X86::VFNMADDPDr132mY,       TB_ALIGN_32 },
     { X86::VFNMADDPSr213rY,       X86::VFNMADDPSr213mY,       TB_ALIGN_32 },
     { X86::VFNMADDPDr213rY,       X86::VFNMADDPDr213mY,       TB_ALIGN_32 },
-    { X86::VFNMADDPSr213r_Int,    X86::VFNMADDPSr213m_Int,    TB_ALIGN_16 },
-    { X86::VFNMADDPDr213r_Int,    X86::VFNMADDPDr213m_Int,    TB_ALIGN_16 },
-    { X86::VFNMADDPSr213rY_Int,   X86::VFNMADDPSr213mY_Int,   TB_ALIGN_32 },
-    { X86::VFNMADDPDr213rY_Int,   X86::VFNMADDPDr213mY_Int,   TB_ALIGN_32 },
 
     { X86::VFMSUBSSr231r,         X86::VFMSUBSSr231m,         0 },
     { X86::VFMSUBSDr231r,         X86::VFMSUBSDr231m,         0 },
@@ -1197,10 +1237,6 @@
     { X86::VFMSUBPDr132rY,        X86::VFMSUBPDr132mY,        TB_ALIGN_32 },
     { X86::VFMSUBPSr213rY,        X86::VFMSUBPSr213mY,        TB_ALIGN_32 },
     { X86::VFMSUBPDr213rY,        X86::VFMSUBPDr213mY,        TB_ALIGN_32 },
-    { X86::VFMSUBPSr213r_Int,     X86::VFMSUBPSr213m_Int,     TB_ALIGN_16 },
-    { X86::VFMSUBPDr213r_Int,     X86::VFMSUBPDr213m_Int,     TB_ALIGN_16 },
-    { X86::VFMSUBPSr213rY_Int,    X86::VFMSUBPSr213mY_Int,    TB_ALIGN_32 },
-    { X86::VFMSUBPDr213rY_Int,    X86::VFMSUBPDr213mY_Int,    TB_ALIGN_32 },
 
     { X86::VFNMSUBSSr231r,        X86::VFNMSUBSSr231m,        0 },
     { X86::VFNMSUBSDr231r,        X86::VFNMSUBSDr231m,        0 },
@@ -1223,10 +1259,6 @@
     { X86::VFNMSUBPDr132rY,       X86::VFNMSUBPDr132mY,       TB_ALIGN_32 },
     { X86::VFNMSUBPSr213rY,       X86::VFNMSUBPSr213mY,       TB_ALIGN_32 },
     { X86::VFNMSUBPDr213rY,       X86::VFNMSUBPDr213mY,       TB_ALIGN_32 },
-    { X86::VFNMSUBPSr213r_Int,    X86::VFNMSUBPSr213m_Int,    TB_ALIGN_16 },
-    { X86::VFNMSUBPDr213r_Int,    X86::VFNMSUBPDr213m_Int,    TB_ALIGN_16 },
-    { X86::VFNMSUBPSr213rY_Int,   X86::VFNMSUBPSr213mY_Int,   TB_ALIGN_32 },
-    { X86::VFNMSUBPDr213rY_Int,   X86::VFNMSUBPDr213mY_Int,   TB_ALIGN_32 },
 
     { X86::VFMADDSUBPSr231r,      X86::VFMADDSUBPSr231m,      TB_ALIGN_16 },
     { X86::VFMADDSUBPDr231r,      X86::VFMADDSUBPDr231m,      TB_ALIGN_16 },
@@ -1240,10 +1272,6 @@
     { X86::VFMADDSUBPDr132rY,     X86::VFMADDSUBPDr132mY,     TB_ALIGN_32 },
     { X86::VFMADDSUBPSr213rY,     X86::VFMADDSUBPSr213mY,     TB_ALIGN_32 },
     { X86::VFMADDSUBPDr213rY,     X86::VFMADDSUBPDr213mY,     TB_ALIGN_32 },
-    { X86::VFMADDSUBPSr213r_Int,  X86::VFMADDSUBPSr213m_Int,  TB_ALIGN_16 },
-    { X86::VFMADDSUBPDr213r_Int,  X86::VFMADDSUBPDr213m_Int,  TB_ALIGN_16 },
-    { X86::VFMADDSUBPSr213rY_Int, X86::VFMADDSUBPSr213mY_Int, TB_ALIGN_32 },
-    { X86::VFMADDSUBPDr213rY_Int, X86::VFMADDSUBPDr213mY_Int, TB_ALIGN_32 },
 
     { X86::VFMSUBADDPSr231r,      X86::VFMSUBADDPSr231m,      TB_ALIGN_16 },
     { X86::VFMSUBADDPDr231r,      X86::VFMSUBADDPDr231m,      TB_ALIGN_16 },
@@ -1257,10 +1285,40 @@
     { X86::VFMSUBADDPDr132rY,     X86::VFMSUBADDPDr132mY,     TB_ALIGN_32 },
     { X86::VFMSUBADDPSr213rY,     X86::VFMSUBADDPSr213mY,     TB_ALIGN_32 },
     { X86::VFMSUBADDPDr213rY,     X86::VFMSUBADDPDr213mY,     TB_ALIGN_32 },
-    { X86::VFMSUBADDPSr213r_Int,  X86::VFMSUBADDPSr213m_Int,  TB_ALIGN_16 },
-    { X86::VFMSUBADDPDr213r_Int,  X86::VFMSUBADDPDr213m_Int,  TB_ALIGN_16 },
-    { X86::VFMSUBADDPSr213rY_Int, X86::VFMSUBADDPSr213mY_Int, TB_ALIGN_32 },
-    { X86::VFMSUBADDPDr213rY_Int, X86::VFMSUBADDPDr213mY_Int, TB_ALIGN_32 },
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           0           },
+    { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           0           },
+    { X86::VFMADDPS4rr,           X86::VFMADDPS4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPD4rr,           X86::VFMADDPD4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPS4rrY,          X86::VFMADDPS4rmY,          TB_ALIGN_32 },
+    { X86::VFMADDPD4rrY,          X86::VFMADDPD4rmY,          TB_ALIGN_32 },
+    { X86::VFNMADDSS4rr,          X86::VFNMADDSS4rm,          0           },
+    { X86::VFNMADDSD4rr,          X86::VFNMADDSD4rm,          0           },
+    { X86::VFNMADDPS4rr,          X86::VFNMADDPS4rm,          TB_ALIGN_16 },
+    { X86::VFNMADDPD4rr,          X86::VFNMADDPD4rm,          TB_ALIGN_16 },
+    { X86::VFNMADDPS4rrY,         X86::VFNMADDPS4rmY,         TB_ALIGN_32 },
+    { X86::VFNMADDPD4rrY,         X86::VFNMADDPD4rmY,         TB_ALIGN_32 },
+    { X86::VFMSUBSS4rr,           X86::VFMSUBSS4rm,           0           },
+    { X86::VFMSUBSD4rr,           X86::VFMSUBSD4rm,           0           },
+    { X86::VFMSUBPS4rr,           X86::VFMSUBPS4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPD4rr,           X86::VFMSUBPD4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPS4rrY,          X86::VFMSUBPS4rmY,          TB_ALIGN_32 },
+    { X86::VFMSUBPD4rrY,          X86::VFMSUBPD4rmY,          TB_ALIGN_32 },
+    { X86::VFNMSUBSS4rr,          X86::VFNMSUBSS4rm,          0           },
+    { X86::VFNMSUBSD4rr,          X86::VFNMSUBSD4rm,          0           },
+    { X86::VFNMSUBPS4rr,          X86::VFNMSUBPS4rm,          TB_ALIGN_16 },
+    { X86::VFNMSUBPD4rr,          X86::VFNMSUBPD4rm,          TB_ALIGN_16 },
+    { X86::VFNMSUBPS4rrY,         X86::VFNMSUBPS4rmY,         TB_ALIGN_32 },
+    { X86::VFNMSUBPD4rrY,         X86::VFNMSUBPD4rmY,         TB_ALIGN_32 },
+    { X86::VFMADDSUBPS4rr,        X86::VFMADDSUBPS4rm,        TB_ALIGN_16 },
+    { X86::VFMADDSUBPD4rr,        X86::VFMADDSUBPD4rm,        TB_ALIGN_16 },
+    { X86::VFMADDSUBPS4rrY,       X86::VFMADDSUBPS4rmY,       TB_ALIGN_32 },
+    { X86::VFMADDSUBPD4rrY,       X86::VFMADDSUBPD4rmY,       TB_ALIGN_32 },
+    { X86::VFMSUBADDPS4rr,        X86::VFMSUBADDPS4rm,        TB_ALIGN_16 },
+    { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
+    { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
+    { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
@@ -1318,8 +1376,7 @@
     SrcReg = MI.getOperand(1).getReg();
     DstReg = MI.getOperand(0).getReg();
     switch (MI.getOpcode()) {
-    default:
-      llvm_unreachable(0);
+    default: llvm_unreachable("Unreachable!");
     case X86::MOVSX16rr8:
     case X86::MOVZX16rr8:
     case X86::MOVSX32rr8:
@@ -1483,69 +1540,69 @@
                                                 AliasAnalysis *AA) const {
   switch (MI->getOpcode()) {
   default: break;
-    case X86::MOV8rm:
-    case X86::MOV16rm:
-    case X86::MOV32rm:
-    case X86::MOV64rm:
-    case X86::LD_Fp64m:
-    case X86::MOVSSrm:
-    case X86::MOVSDrm:
-    case X86::MOVAPSrm:
-    case X86::MOVUPSrm:
-    case X86::MOVAPDrm:
-    case X86::MOVDQArm:
-    case X86::VMOVSSrm:
-    case X86::VMOVSDrm:
-    case X86::VMOVAPSrm:
-    case X86::VMOVUPSrm:
-    case X86::VMOVAPDrm:
-    case X86::VMOVDQArm:
-    case X86::VMOVAPSYrm:
-    case X86::VMOVUPSYrm:
-    case X86::VMOVAPDYrm:
-    case X86::VMOVDQAYrm:
-    case X86::MMX_MOVD64rm:
-    case X86::MMX_MOVQ64rm:
-    case X86::FsVMOVAPSrm:
-    case X86::FsVMOVAPDrm:
-    case X86::FsMOVAPSrm:
-    case X86::FsMOVAPDrm: {
-      // Loads from constant pools are trivially rematerializable.
-      if (MI->getOperand(1).isReg() &&
-          MI->getOperand(2).isImm() &&
-          MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
-          MI->isInvariantLoad(AA)) {
-        unsigned BaseReg = MI->getOperand(1).getReg();
-        if (BaseReg == 0 || BaseReg == X86::RIP)
-          return true;
-        // Allow re-materialization of PIC load.
-        if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
-          return false;
-        const MachineFunction &MF = *MI->getParent()->getParent();
-        const MachineRegisterInfo &MRI = MF.getRegInfo();
-        return regIsPICBase(BaseReg, MRI);
-      }
-      return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm: {
+    // Loads from constant pools are trivially rematerializable.
+    if (MI->getOperand(1).isReg() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+        MI->isInvariantLoad(AA)) {
+      unsigned BaseReg = MI->getOperand(1).getReg();
+      if (BaseReg == 0 || BaseReg == X86::RIP)
+        return true;
+      // Allow re-materialization of PIC load.
+      if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+        return false;
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
     }
+    return false;
+  }
 
-     case X86::LEA32r:
-     case X86::LEA64r: {
-       if (MI->getOperand(2).isImm() &&
-           MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
-           !MI->getOperand(4).isReg()) {
-         // lea fi#, lea GV, etc. are all rematerializable.
-         if (!MI->getOperand(1).isReg())
-           return true;
-         unsigned BaseReg = MI->getOperand(1).getReg();
-         if (BaseReg == 0)
-           return true;
-         // Allow re-materialization of lea PICBase + x.
-         const MachineFunction &MF = *MI->getParent()->getParent();
-         const MachineRegisterInfo &MRI = MF.getRegInfo();
-         return regIsPICBase(BaseReg, MRI);
-       }
-       return false;
-     }
+  case X86::LEA32r:
+  case X86::LEA64r: {
+    if (MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+        !MI->getOperand(4).isReg()) {
+      // lea fi#, lea GV, etc. are all rematerializable.
+      if (!MI->getOperand(1).isReg())
+        return true;
+      unsigned BaseReg = MI->getOperand(1).getReg();
+      if (BaseReg == 0)
+        return true;
+      // Allow re-materialization of lea PICBase + x.
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
   }
 
   // All other instructions marked M_REMATERIALIZABLE are always trivially
@@ -1654,7 +1711,7 @@
   case X86::MOV64r0: {
     if (!isSafeToClobberEFLAGS(MBB, I)) {
       switch (Opc) {
-      default: break;
+      default: llvm_unreachable("Unreachable!");
       case X86::MOV8r0:  Opc = X86::MOV8ri;  break;
       case X86::MOV16r0: Opc = X86::MOV16ri; break;
       case X86::MOV32r0: Opc = X86::MOV32ri; break;
@@ -1727,8 +1784,7 @@
   MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
                                     get(Opc), leaOutReg);
   switch (MIOpc) {
-  default:
-    llvm_unreachable(0);
+  default: llvm_unreachable("Unreachable!");
   case X86::SHL16ri: {
     unsigned ShAmt = MI->getOperand(2).getImm();
     MIB.addReg(0).addImm(1 << ShAmt)
@@ -1812,10 +1868,8 @@
   MachineInstr *MI = MBBI;
   MachineFunction &MF = *MI->getParent()->getParent();
   // All instructions input are two-addr instructions.  Get the known operands.
-  unsigned Dest = MI->getOperand(0).getReg();
-  unsigned Src = MI->getOperand(1).getReg();
-  bool isDead = MI->getOperand(0).isDead();
-  bool isKill = MI->getOperand(1).isKill();
+  const MachineOperand &Dest = MI->getOperand(0);
+  const MachineOperand &Src = MI->getOperand(1);
 
   MachineInstr *NewMI = NULL;
   // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
@@ -1833,11 +1887,9 @@
     unsigned B = MI->getOperand(1).getReg();
     unsigned C = MI->getOperand(2).getReg();
     if (B != C) return 0;
-    unsigned A = MI->getOperand(0).getReg();
     unsigned M = MI->getOperand(3).getImm();
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
-      .addReg(A, RegState::Define | getDeadRegState(isDead))
-      .addReg(B, getKillRegState(isKill)).addImm(M);
+      .addOperand(Dest).addOperand(Src).addImm(M);
     break;
   }
   case X86::SHUFPDrri: {
@@ -1847,15 +1899,13 @@
     unsigned B = MI->getOperand(1).getReg();
     unsigned C = MI->getOperand(2).getReg();
     if (B != C) return 0;
-    unsigned A = MI->getOperand(0).getReg();
     unsigned M = MI->getOperand(3).getImm();
 
     // Convert to PSHUFD mask.
     M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
 
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
-      .addReg(A, RegState::Define | getDeadRegState(isDead))
-      .addReg(B, getKillRegState(isKill)).addImm(M);
+      .addOperand(Dest).addOperand(Src).addImm(M);
     break;
   }
   case X86::SHL64ri: {
@@ -1866,15 +1916,14 @@
     if (ShAmt == 0 || ShAmt >= 4) return 0;
 
     // LEA can't handle RSP.
-    if (TargetRegisterInfo::isVirtualRegister(Src) &&
-        !MF.getRegInfo().constrainRegClass(Src, &X86::GR64_NOSPRegClass))
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR64_NOSPRegClass))
       return 0;
 
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
-      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-      .addReg(0).addImm(1 << ShAmt)
-      .addReg(Src, getKillRegState(isKill))
-      .addImm(0).addReg(0);
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
   case X86::SHL32ri: {
@@ -1885,15 +1934,15 @@
     if (ShAmt == 0 || ShAmt >= 4) return 0;
 
     // LEA can't handle ESP.
-    if (TargetRegisterInfo::isVirtualRegister(Src) &&
-        !MF.getRegInfo().constrainRegClass(Src, &X86::GR32_NOSPRegClass))
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR32_NOSPRegClass))
       return 0;
 
     unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
-      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-      .addReg(0).addImm(1 << ShAmt)
-      .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0);
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
   case X86::SHL16ri: {
@@ -1906,10 +1955,8 @@
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-      .addReg(0).addImm(1 << ShAmt)
-      .addReg(Src, getKillRegState(isKill))
-      .addImm(0).addReg(0);
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
   default: {
@@ -1932,14 +1979,12 @@
         (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
 
       // LEA can't handle RSP.
-      if (TargetRegisterInfo::isVirtualRegister(Src) &&
-          !MF.getRegInfo().constrainRegClass(Src, RC))
+      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
         return 0;
 
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                              Src, isKill, 1);
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src), 1);
       break;
     }
     case X86::INC16r:
@@ -1947,10 +1992,8 @@
       if (DisableLEA16)
         return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                           .addReg(Dest, RegState::Define |
-                                   getDeadRegState(isDead)),
-                           Src, isKill, 1);
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), 1);
       break;
     case X86::DEC64r:
     case X86::DEC32r:
@@ -1962,14 +2005,12 @@
         (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
         (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
       // LEA can't handle RSP.
-      if (TargetRegisterInfo::isVirtualRegister(Src) &&
-          !MF.getRegInfo().constrainRegClass(Src, RC))
+      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
         return 0;
 
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                              Src, isKill, -1);
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src), -1);
       break;
     }
     case X86::DEC16r:
@@ -1977,10 +2018,8 @@
       if (DisableLEA16)
         return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                           .addReg(Dest, RegState::Define |
-                                   getDeadRegState(isDead)),
-                           Src, isKill, -1);
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), -1);
       break;
     case X86::ADD64rr:
     case X86::ADD64rr_DB:
@@ -2007,9 +2046,8 @@
         return 0;
 
       NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                        .addReg(Dest, RegState::Define |
-                                getDeadRegState(isDead)),
-                        Src, isKill, Src2, isKill2);
+                        .addOperand(Dest),
+                        Src.getReg(), Src.isKill(), Src2, isKill2);
 
       // Preserve undefness of the operands.
       bool isUndef = MI->getOperand(1).isUndef();
@@ -2029,9 +2067,15 @@
       unsigned Src2 = MI->getOperand(2).getReg();
       bool isKill2 = MI->getOperand(2).isKill();
       NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                        .addReg(Dest, RegState::Define |
-                                getDeadRegState(isDead)),
-                        Src, isKill, Src2, isKill2);
+                        .addOperand(Dest),
+                        Src.getReg(), Src.isKill(), Src2, isKill2);
+
+      // Preserve undefness of the operands.
+      bool isUndef = MI->getOperand(1).isUndef();
+      bool isUndef2 = MI->getOperand(2).isUndef();
+      NewMI->getOperand(1).setIsUndef(isUndef);
+      NewMI->getOperand(3).setIsUndef(isUndef2);
+
       if (LV && isKill2)
         LV->replaceKillInstruction(Src2, MI, NewMI);
       break;
@@ -2041,10 +2085,9 @@
     case X86::ADD64ri32_DB:
     case X86::ADD64ri8_DB:
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                              Src, isKill, MI->getOperand(2).getImm());
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
       break;
     case X86::ADD32ri:
     case X86::ADD32ri8:
@@ -2052,10 +2095,9 @@
     case X86::ADD32ri8_DB: {
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                                Src, isKill, MI->getOperand(2).getImm());
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
       break;
     }
     case X86::ADD16ri:
@@ -2065,10 +2107,9 @@
       if (DisableLEA16)
         return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                              .addReg(Dest, RegState::Define |
-                                      getDeadRegState(isDead)),
-                              Src, isKill, MI->getOperand(2).getImm());
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
       break;
     }
   }
@@ -2077,10 +2118,10 @@
   if (!NewMI) return 0;
 
   if (LV) {  // Update live variables
-    if (isKill)
-      LV->replaceKillInstruction(Src, MI, NewMI);
-    if (isDead)
-      LV->replaceKillInstruction(Dest, MI, NewMI);
+    if (Src.isKill())
+      LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+    if (Dest.isDead())
+      LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
   }
 
   MFI->insert(MBBI, NewMI);          // Insert the new inst
@@ -2120,57 +2161,25 @@
     MI->getOperand(3).setImm(Size-Amt);
     return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
   }
-  case X86::CMOVB16rr:
-  case X86::CMOVB32rr:
-  case X86::CMOVB64rr:
-  case X86::CMOVAE16rr:
-  case X86::CMOVAE32rr:
-  case X86::CMOVAE64rr:
-  case X86::CMOVE16rr:
-  case X86::CMOVE32rr:
-  case X86::CMOVE64rr:
-  case X86::CMOVNE16rr:
-  case X86::CMOVNE32rr:
-  case X86::CMOVNE64rr:
-  case X86::CMOVBE16rr:
-  case X86::CMOVBE32rr:
-  case X86::CMOVBE64rr:
-  case X86::CMOVA16rr:
-  case X86::CMOVA32rr:
-  case X86::CMOVA64rr:
-  case X86::CMOVL16rr:
-  case X86::CMOVL32rr:
-  case X86::CMOVL64rr:
-  case X86::CMOVGE16rr:
-  case X86::CMOVGE32rr:
-  case X86::CMOVGE64rr:
-  case X86::CMOVLE16rr:
-  case X86::CMOVLE32rr:
-  case X86::CMOVLE64rr:
-  case X86::CMOVG16rr:
-  case X86::CMOVG32rr:
-  case X86::CMOVG64rr:
-  case X86::CMOVS16rr:
-  case X86::CMOVS32rr:
-  case X86::CMOVS64rr:
-  case X86::CMOVNS16rr:
-  case X86::CMOVNS32rr:
-  case X86::CMOVNS64rr:
-  case X86::CMOVP16rr:
-  case X86::CMOVP32rr:
-  case X86::CMOVP64rr:
-  case X86::CMOVNP16rr:
-  case X86::CMOVNP32rr:
-  case X86::CMOVNP64rr:
-  case X86::CMOVO16rr:
-  case X86::CMOVO32rr:
-  case X86::CMOVO64rr:
-  case X86::CMOVNO16rr:
-  case X86::CMOVNO32rr:
-  case X86::CMOVNO64rr: {
-    unsigned Opc = 0;
+  case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
+  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+  case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
+  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+  case X86::CMOVA16rr:  case X86::CMOVA32rr:  case X86::CMOVA64rr:
+  case X86::CMOVL16rr:  case X86::CMOVL32rr:  case X86::CMOVL64rr:
+  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+  case X86::CMOVG16rr:  case X86::CMOVG32rr:  case X86::CMOVG64rr:
+  case X86::CMOVS16rr:  case X86::CMOVS32rr:  case X86::CMOVS64rr:
+  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+  case X86::CMOVP16rr:  case X86::CMOVP32rr:  case X86::CMOVP64rr:
+  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+  case X86::CMOVO16rr:  case X86::CMOVO32rr:  case X86::CMOVO64rr:
+  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
+    unsigned Opc;
     switch (MI->getOpcode()) {
-    default: break;
+    default: llvm_unreachable("Unreachable!");
     case X86::CMOVB16rr:  Opc = X86::CMOVAE16rr; break;
     case X86::CMOVB32rr:  Opc = X86::CMOVAE32rr; break;
     case X86::CMOVB64rr:  Opc = X86::CMOVAE64rr; break;
@@ -2279,7 +2288,7 @@
 }
 
 /// getCondFromCmovOpc - return condition code of a CMov opcode.
-static X86::CondCode getCondFromCMovOpc(unsigned Opc) {
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
   switch (Opc) {
   default: return X86::COND_INVALID;
   case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
@@ -2402,7 +2411,7 @@
 /// whether it has memory operand.
 static unsigned getSETFromCond(X86::CondCode CC,
                                bool HasMemoryOperand) {
-  static const unsigned Opc[16][2] = {
+  static const uint16_t Opc[16][2] = {
     { X86::SETAr,  X86::SETAm  },
     { X86::SETAEr, X86::SETAEm },
     { X86::SETBr,  X86::SETBm  },
@@ -2429,7 +2438,7 @@
 /// register size in bytes, and operand type.
 static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes,
                                 bool HasMemoryOperand) {
-  static const unsigned Opc[32][3] = {
+  static const uint16_t Opc[32][3] = {
     { X86::CMOVA16rr,  X86::CMOVA32rr,  X86::CMOVA64rr  },
     { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
     { X86::CMOVB16rr,  X86::CMOVB32rr,  X86::CMOVB64rr  },
@@ -2762,19 +2771,18 @@
   // SrcReg(GR64)  -> DestReg(VR64)
 
   if (X86::GR64RegClass.contains(DestReg)) {
-    if (X86::VR128RegClass.contains(SrcReg)) {
+    if (X86::VR128RegClass.contains(SrcReg))
       // Copy from a VR128 register to a GR64 register.
       return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
-    } else if (X86::VR64RegClass.contains(SrcReg)) {
+    if (X86::VR64RegClass.contains(SrcReg))
       // Copy from a VR64 register to a GR64 register.
       return X86::MOVSDto64rr;
-    }
   } else if (X86::GR64RegClass.contains(SrcReg)) {
     // Copy from a GR64 register to a VR128 register.
     if (X86::VR128RegClass.contains(DestReg))
       return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
     // Copy from a GR64 register to a VR64 register.
-    else if (X86::VR64RegClass.contains(DestReg))
+    if (X86::VR64RegClass.contains(DestReg))
       return X86::MOV64toSDrr;
   }
 
@@ -2782,12 +2790,12 @@
   // SrcReg(GR32) -> DestReg(FR32)
 
   if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
-      // Copy from a FR32 register to a GR32 register.
-      return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+    // Copy from a FR32 register to a GR32 register.
+    return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
 
   if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
-      // Copy from a GR32 register to a FR32 register.
-      return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+    // Copy from a GR32 register to a FR32 register.
+    return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
 
   return 0;
 }
@@ -2798,7 +2806,7 @@
                                bool KillSrc) const {
   // First deal with the normal symmetric copies.
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
-  unsigned Opc = 0;
+  unsigned Opc;
   if (X86::GR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MOV64rr;
   else if (X86::GR32RegClass.contains(DestReg, SrcReg))
@@ -2837,7 +2845,8 @@
       BuildMI(MBB, MI, DL, get(X86::PUSHF64));
       BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
       return;
-    } else if (X86::GR32RegClass.contains(DestReg)) {
+    }
+    if (X86::GR32RegClass.contains(DestReg)) {
       BuildMI(MBB, MI, DL, get(X86::PUSHF32));
       BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
       return;
@@ -2849,7 +2858,8 @@
         .addReg(SrcReg, getKillRegState(KillSrc));
       BuildMI(MBB, MI, DL, get(X86::POPF64));
       return;
-    } else if (X86::GR32RegClass.contains(SrcReg)) {
+    }
+    if (X86::GR32RegClass.contains(SrcReg)) {
       BuildMI(MBB, MI, DL, get(X86::PUSH32r))
         .addReg(SrcReg, getKillRegState(KillSrc));
       BuildMI(MBB, MI, DL, get(X86::POPF32));
@@ -3139,11 +3149,19 @@
   case X86::SUB8ri:    case X86::SUB64rr:  case X86::SUB32rr:
   case X86::SUB16rr:   case X86::SUB8rr:   case X86::SUB64rm:
   case X86::SUB32rm:   case X86::SUB16rm:  case X86::SUB8rm:
+  case X86::DEC64r:  case X86::DEC32r:  case X86::DEC16r: case X86::DEC8r:
+  case X86::DEC64m:  case X86::DEC32m:  case X86::DEC16m: case X86::DEC8m:
+  case X86::DEC64_32r: case X86::DEC64_16r:
+  case X86::DEC64_32m: case X86::DEC64_16m:
   case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
   case X86::ADD32ri8:  case X86::ADD16ri:  case X86::ADD16ri8:
   case X86::ADD8ri:    case X86::ADD64rr:  case X86::ADD32rr:
   case X86::ADD16rr:   case X86::ADD8rr:   case X86::ADD64rm:
   case X86::ADD32rm:   case X86::ADD16rm:  case X86::ADD8rm:
+  case X86::INC64r:  case X86::INC32r:  case X86::INC16r: case X86::INC8r:
+  case X86::INC64m:  case X86::INC32m:  case X86::INC16m: case X86::INC8m:
+  case X86::INC64_32r: case X86::INC64_16r:
+  case X86::INC64_32m: case X86::INC64_16m:
   case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
   case X86::AND32ri8:  case X86::AND16ri:  case X86::AND16ri8:
   case X86::AND8ri:    case X86::AND64rr:  case X86::AND32rr:
@@ -3193,7 +3211,7 @@
       return false;
     // There is no use of the destination register, we can replace SUB with CMP.
     switch (CmpInstr->getOpcode()) {
-    default: llvm_unreachable(0);
+    default: llvm_unreachable("Unreachable!");
     case X86::SUB64rm:   NewOpcode = X86::CMP64rm;   break;
     case X86::SUB32rm:   NewOpcode = X86::CMP32rm;   break;
     case X86::SUB16rm:   NewOpcode = X86::CMP16rm;   break;
@@ -3318,7 +3336,7 @@
         if (OldCC != X86::COND_INVALID)
           OpcIsSET = true;
         else
-          OldCC = getCondFromCMovOpc(Instr.getOpcode());
+          OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
       }
       if (OldCC == X86::COND_INVALID) return false;
     }
@@ -3383,12 +3401,14 @@
     Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
   }
 
-  // Make sure Sub instruction defines EFLAGS.
+  // Make sure Sub instruction defines EFLAGS and mark the def live.
+  unsigned LastOperand = Sub->getNumOperands() - 1;
   assert(Sub->getNumOperands() >= 2 &&
-         Sub->getOperand(Sub->getNumOperands()-1).isReg() &&
-         Sub->getOperand(Sub->getNumOperands()-1).getReg() == X86::EFLAGS &&
+         Sub->getOperand(LastOperand).isReg() &&
+         Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
          "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
-  Sub->getOperand(Sub->getNumOperands()-1).setIsDef(true);
+  Sub->getOperand(LastOperand).setIsDef(true);
+  Sub->getOperand(LastOperand).setIsDead(false);
   CmpInstr->eraseFromParent();
 
   // Modify the condition code of instructions in OpsToUpdate.
@@ -3497,10 +3517,25 @@
 bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
   switch (MI->getOpcode()) {
+  case X86::SETB_C8r:
+    return Expand2AddrUndef(MI, get(X86::SBB8rr));
+  case X86::SETB_C16r:
+    return Expand2AddrUndef(MI, get(X86::SBB16rr));
+  case X86::SETB_C32r:
+    return Expand2AddrUndef(MI, get(X86::SBB32rr));
+  case X86::SETB_C64r:
+    return Expand2AddrUndef(MI, get(X86::SBB64rr));
   case X86::V_SET0:
   case X86::FsFLD0SS:
   case X86::FsFLD0SD:
     return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+  case X86::AVX_SET0:
+    assert(HasAVX && "AVX not supported");
+    return Expand2AddrUndef(MI, get(X86::VXORPSYrr));
+  case X86::V_SETALLONES:
+    return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+  case X86::AVX2_SETALLONES:
+    return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr));
   case X86::TEST8ri_NOREX:
     MI->setDesc(get(X86::TEST8ri));
     return true;
@@ -3614,14 +3649,16 @@
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
     isTwoAddrFold = true;
   } else if (i == 0) { // If operand 0
-    if (MI->getOpcode() == X86::MOV64r0)
-      NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
-    else if (MI->getOpcode() == X86::MOV32r0)
-      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
-    else if (MI->getOpcode() == X86::MOV16r0)
-      NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
-    else if (MI->getOpcode() == X86::MOV8r0)
-      NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
+    unsigned Opc = 0;
+    switch (MI->getOpcode()) {
+    default: break;
+    case X86::MOV64r0: Opc = X86::MOV64mi32; break;
+    case X86::MOV32r0: Opc = X86::MOV32mi;   break;
+    case X86::MOV16r0: Opc = X86::MOV16mi;   break;
+    case X86::MOV8r0:  Opc = X86::MOV8mi;    break;
+    }
+    if (Opc)
+      NewMI = MakeM0Inst(*this, Opc, MOs, MI);
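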
     if (NewMI)
       return NewMI;
 
@@ -3799,7 +3836,8 @@
 
   // Unless optimizing for size, don't fold to avoid partial
   // register update stalls
-  if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+  if (!MF.getFunction()->getFnAttributes().
+        hasAttribute(Attributes::OptimizeForSize) &&
       hasPartialRegUpdate(MI->getOpcode()))
     return 0;
 
@@ -3840,7 +3878,8 @@
 
   // Unless optimizing for size, don't fold to avoid partial
   // register update stalls
-  if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+  if (!MF.getFunction()->getFnAttributes().
+        hasAttribute(Attributes::OptimizeForSize) &&
       hasPartialRegUpdate(MI->getOpcode()))
     return 0;
 
@@ -3850,15 +3889,12 @@
     Alignment = (*LoadMI->memoperands_begin())->getAlignment();
   else
     switch (LoadMI->getOpcode()) {
-    case X86::AVX_SET0PSY:
-    case X86::AVX_SET0PDY:
     case X86::AVX2_SETALLONES:
-    case X86::AVX2_SET0:
+    case X86::AVX_SET0:
       Alignment = 32;
       break;
     case X86::V_SET0:
     case X86::V_SETALLONES:
-    case X86::AVX_SETALLONES:
       Alignment = 16;
       break;
     case X86::FsFLD0SD:
@@ -3894,11 +3930,8 @@
   switch (LoadMI->getOpcode()) {
   case X86::V_SET0:
   case X86::V_SETALLONES:
-  case X86::AVX_SET0PSY:
-  case X86::AVX_SET0PDY:
-  case X86::AVX_SETALLONES:
   case X86::AVX2_SETALLONES:
-  case X86::AVX2_SET0:
+  case X86::AVX_SET0:
   case X86::FsFLD0SD:
   case X86::FsFLD0SS: {
     // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
@@ -3930,15 +3963,12 @@
       Ty = Type::getFloatTy(MF.getFunction()->getContext());
     else if (Opc == X86::FsFLD0SD)
       Ty = Type::getDoubleTy(MF.getFunction()->getContext());
-    else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY)
-      Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
-    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX2_SET0)
+    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
     else
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
 
-    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES ||
-                      Opc == X86::AVX2_SETALLONES);
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
     const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
                                     Constant::getNullValue(Ty);
     unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
@@ -4013,6 +4043,8 @@
     OpcodeTablePtr = &RegOp2MemOpTable1;
   } else if (OpNum == 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2;
+  } else if (OpNum == 3) {
+    OpcodeTablePtr = &RegOp2MemOpTable3;
   }
 
   if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
@@ -4102,7 +4134,6 @@
                getUndefRegState(MO.isUndef()));
   }
   // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
-  unsigned NewOpc = 0;
   switch (DataMI->getOpcode()) {
   default: break;
   case X86::CMP64ri32:
@@ -4115,8 +4146,9 @@
     MachineOperand &MO0 = DataMI->getOperand(0);
     MachineOperand &MO1 = DataMI->getOperand(1);
     if (MO1.getImm() == 0) {
+      unsigned NewOpc;
       switch (DataMI->getOpcode()) {
-      default: break;
+      default: llvm_unreachable("Unreachable!");
       case X86::CMP64ri8:
       case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
       case X86::CMP32ri8:

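Most of this file's churn extends the register-to-memory fold tables: the Int_CVT* scalar conversions move from the one-operand table to the two-operand table (their destination doubles as a pass-through source, so only the second source operand can legally come from memory), and FMA4 plus BMI/BMI2 entries are added, with foldMemoryOperandImpl now consulting RegOp2MemOpTable3 for OpNum == 3. A toy model of how the table is picked by folded-operand index (types and opcode numbers below are made up for illustration, not LLVM's):

    #include <cstdint>
    #include <cstdio>
    #include <map>

    // Each table maps a register-form opcode to its memory-form twin plus
    // alignment flags; the table is selected by the index of the operand
    // the load folds into.
    struct FoldEntry { unsigned MemOpc; uint16_t Flags; };
    using FoldTable = std::map<unsigned, FoldEntry>;

    static const FoldTable *pickTable(unsigned OpNum, const FoldTable &T1,
                                      const FoldTable &T2,
                                      const FoldTable &T3) {
      switch (OpNum) {
      case 1:  return &T1; // fold into the first source operand
      case 2:  return &T2; // fold into the second source operand
      case 3:  return &T3; // new in this patch: FMA4 three-source forms
      default: return nullptr;
      }
    }

    int main() {
      FoldTable T1, T2, T3;
      T3[1001] = {1002, 0}; // e.g. a VFMADDSS4rr-style reg form -> mem form
      if (const FoldTable *T = pickTable(3, T1, T2, T3))
        std::printf("mem form: %u\n", T->at(1001).MemOpc); // 1002
      return 0;
    }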
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.h Tue Jan 15 11:16:16 2013
@@ -61,6 +61,9 @@
   // Turn condition code into conditional branch opcode.
   unsigned GetCondBranchFromCond(CondCode CC);
 
+  // Turn CMov opcode into condition code.
+  CondCode getCondFromCMovOpc(unsigned Opc);
+
   /// GetOppositeBranchCondition - Return the inverse of the specified cond,
   /// e.g. turning COND_E to COND_NE.
   CondCode GetOppositeBranchCondition(X86::CondCode CC);

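Exporting getCondFromCMovOpc into the X86 namespace lets other parts of the target classify CMOVs; it was previously a static helper in X86InstrInfo.cpp. A hypothetical call site (fragment only, assumes the X86 target headers; the helper returns X86::COND_INVALID for non-CMOV opcodes):

    // Recover the condition encoded in a CMOV opcode and invert it.
    X86::CondCode CC = X86::getCondFromCMovOpc(MI->getOpcode());
    if (CC != X86::COND_INVALID)
      CC = X86::GetOppositeBranchCondition(CC);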
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrInfo.td Tue Jan 15 11:16:16 2013
@@ -114,7 +114,7 @@
 def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
 
 def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
-                            [SDNPHasChain]>;
+                            [SDNPHasChain, SDNPSideEffect]>;
 def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
                         [SDNPHasChain]>;
 def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
@@ -216,6 +216,14 @@
 def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
                         [SDNPHasChain]>;
 
+def X86eh_sjlj_setjmp  : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+                                SDTypeProfile<1, 1, [SDTCisInt<0>,
+                                                     SDTCisPtrTy<1>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+                                SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+
 def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
                         [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
 
@@ -397,7 +405,7 @@
 
 let OperandType = "OPERAND_PCREL",
     ParserMatchClass = X86AbsMemAsmOperand,
-    PrintMethod = "print_pcrel_imm" in {
+    PrintMethod = "printPCRelImm" in {
 def i32imm_pcrel : Operand<i32>;
 def i16imm_pcrel : Operand<i16>;
 
@@ -418,7 +426,7 @@
 }
 
 def AVXCC : Operand<i8> {
-  let PrintMethod = "printSSECC";
+  let PrintMethod = "printAVXCC";
   let OperandType = "OPERAND_IMMEDIATE";
 }
 
@@ -499,7 +507,7 @@
 // 64-bits but only 32 bits are significant, and those bits are treated as being
 // pc relative.
 def i64i32imm_pcrel : Operand<i64> {
-  let PrintMethod = "print_pcrel_imm";
+  let PrintMethod = "printPCRelImm";
   let ParserMatchClass = X86AbsMemAsmOperand;
   let OperandType = "OPERAND_PCREL";
 }
@@ -552,14 +560,21 @@
 def Has3DNow     : Predicate<"Subtarget->has3DNow()">;
 def Has3DNowA    : Predicate<"Subtarget->has3DNowA()">;
 def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1      : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
 def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2      : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
 def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3      : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
 def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3     : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
 def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def UseSSE41     : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
 def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42     : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
 def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
 def HasAVX       : Predicate<"Subtarget->hasAVX()">;
 def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
 
 def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
 def HasAES       : Predicate<"Subtarget->hasAES()">;
@@ -574,6 +589,7 @@
 def HasLZCNT     : Predicate<"Subtarget->hasLZCNT()">;
 def HasBMI       : Predicate<"Subtarget->hasBMI()">;
 def HasBMI2      : Predicate<"Subtarget->hasBMI2()">;
+def HasRTM       : Predicate<"Subtarget->hasRTM()">;
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
 def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
@@ -1259,28 +1275,46 @@
 // Atomic support
 //
 
-
 // Atomic swap. These are just normal xchg instructions. But since a memory
 // operand is referenced, the atomicity is ensured.
-let Constraints = "$val = $dst" in {
-def XCHG8rm  : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
-               "xchg{b}\t{$val, $ptr|$ptr, $val}",
-               [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))],
-               IIC_XCHG_MEM>;
-def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr),
-               "xchg{w}\t{$val, $ptr|$ptr, $val}",
-               [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))],
-               IIC_XCHG_MEM>,
-                OpSize;
-def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr),
-               "xchg{l}\t{$val, $ptr|$ptr, $val}",
-               [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))],
-               IIC_XCHG_MEM>;
-def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr),
-                  "xchg{q}\t{$val, $ptr|$ptr, $val}",
-                  [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))],
-                  IIC_XCHG_MEM>;
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+                       InstrItinClass itin> {
+  let Constraints = "$val = $dst" in {
+    def #NAME#8rm  : I<opc8, MRMSrcMem, (outs GR8:$dst),
+                       (ins GR8:$val, i8mem:$ptr),
+                       !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+                       [(set
+                          GR8:$dst,
+                          (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+                       itin>;
+    def #NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+                       (ins GR16:$val, i16mem:$ptr),
+                       !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+                       [(set
+                          GR16:$dst,
+                          (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+                       itin>, OpSize;
+    def #NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+                       (ins GR32:$val, i32mem:$ptr),
+                       !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+                       [(set
+                          GR32:$dst,
+                          (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+                       itin>;
+    def #NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+                        (ins GR64:$val, i64mem:$ptr),
+                        !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+                        [(set
+                          GR64:$dst,
+                          (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+                        itin>;
+  }
+}
+
+defm XCHG    : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
 
+// Swap between registers.
+let Constraints = "$val = $dst" in {
 def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
                 "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
 def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
@@ -1291,6 +1325,7 @@
                   "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
 }
 
+// Swap between EAX and other registers.
 def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
                   "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
 def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
@@ -1672,6 +1707,8 @@
 include "X86InstrVMX.td"
 include "X86InstrSVM.td"
 
+include "X86InstrTSX.td"
+
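The new include is tied to the HasRTM predicate added earlier in this file. As
a hedged sketch of the kind of definition X86InstrTSX.td guards with it (the
real defs live in that file, outside this hunk; XEND encodes as 0F 01 D5):

    // Sketch only; assumed shape of a definition in X86InstrTSX.td.
    let Predicates = [HasRTM] in
    def XEND : I<0x01, MRM_D5, (outs), (ins), "xend", []>, TB;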
 // System instructions.
 include "X86InstrSystem.td"
 

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrMMX.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrMMX.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrMMX.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrMMX.td Tue Jan 15 11:16:16 2013
@@ -118,11 +118,11 @@
 /// Unary MMX instructions requiring SSSE3.
 multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId64, OpndItins itins> {
-  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>;
 
-  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR64:$dst,
                      (IntId64 (bitconvert (memopmmx addr:$src))))],
@@ -134,11 +134,11 @@
 multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId64, OpndItins itins> {
   let isCommutable = 0 in
-  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
        (ins VR64:$src1, VR64:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>;
-  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
        (ins VR64:$src1, i64mem:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        [(set VR64:$dst,
@@ -149,11 +149,11 @@
 
 /// PALIGN MMX instructions (require SSSE3).
 multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
-  def R64irr  : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+  def R64irr  : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
       (ins VR64:$src1, VR64:$src2, i8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 
       [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>;
-  def R64irm  : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+  def R64irm  : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
       (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
       [(set VR64:$dst, (IntId VR64:$src1,
@@ -163,12 +163,10 @@
 multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
                          string asm, OpndItins itins, Domain d> {
-  def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
-                        [(set DstRC:$dst, (Int SrcRC:$src))], 
-                        itins.rr, d>;
-  def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
-                        [(set DstRC:$dst, (Int (ld_frag addr:$src)))], 
-                        itins.rm, d>;
+  def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+                  [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>;
+  def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+                  [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>;
 }
 
 multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -209,8 +207,14 @@
 let mayStore = 1 in
 def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
                         "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>;
-def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src),
-                        "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_REG_MM>;
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+                         "movd\t{$src, $dst|$dst, $src}",
+                         [(set GR32:$dst,
+                          (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>;
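Since the SDTypeProfile syntax is dense, a short gloss on the node defined
above: it declares one result and one operand, constrained to i32 and x86mmx,
so the only well-typed use is the one the new pattern on MMX_MOVD64grr selects.

    // Reading MMX_X86movd2w's profile: 1 result (i32), 1 operand (x86mmx).
    //   (i32 (MMX_X86movd2w (x86mmx VR64:$src)))  ->  MMX_MOVD64grr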
 
 let neverHasSideEffects = 1 in
 def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
@@ -243,29 +247,30 @@
                         [(store (x86mmx VR64:$src), addr:$dst)],
                         IIC_MMX_MOVQ_RM>;
 
-def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
-                          (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
-                          [(set VR64:$dst,
-                            (x86mmx (bitconvert
-                            (i64 (vector_extract (v2i64 VR128:$src),
-                                  (iPTR 0))))))],
-                          IIC_MMX_MOVQ_RR>;
-
-def MMX_MOVQ2DQrr : S2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
-                            (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
-          [(set VR128:$dst,
-            (v2i64 (scalar_to_vector
-                              (i64 (bitconvert (x86mmx VR64:$src))))))],
-                           IIC_MMX_MOVQ_RR>;
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                             (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (x86mmx (bitconvert
+                               (i64 (vector_extract (v2i64 VR128:$src),
+                                     (iPTR 0))))))],
+                             IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+                              (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                              [(set VR128:$dst,
+                                (v2i64
+                                  (scalar_to_vector
+                                    (i64 (bitconvert (x86mmx VR64:$src))))))],
+                              IIC_MMX_MOVQ_RR>;
 
 let neverHasSideEffects = 1 in
-def MMX_MOVQ2FR64rr: S2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
-                       (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [],
-                       IIC_MMX_MOVQ_RR>;
-
-def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
-                       (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [],
-                       IIC_MMX_MOVQ_RR>;
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+                               (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                               [], IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                              (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                              [], IIC_MMX_MOVQ_RR>;
 
 def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                          "movntq\t{$src, $dst|$dst, $src}",
@@ -554,20 +559,6 @@
                                 (int_x86_mmx_pmovmskb VR64:$src))]>;
 
 
-// MMX to XMM for vector types
-def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
-                            [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
-          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq (load_mmx addr:$src))),
-          (v2i64 (MOVQI2PQIrm addr:$src))>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq 
-                    (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
-          (v2i64 (MOVDI2PDIrm addr:$src))>;
-
 // Low word of XMM to MMX.
 def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
                             [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
@@ -591,6 +582,7 @@
                            IIC_MMX_MASKMOV>;
 
 // 64-bit bit convert.
+let Predicates = [HasSSE2] in {
 def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
@@ -599,5 +591,6 @@
           (MMX_MOVQ2FR64rr VR64:$src)>;
 def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
           (MMX_MOVFR642Qrr FR64:$src)>;
+}
 
 

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrSSE.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrSSE.td Tue Jan 15 11:16:16 2013
@@ -251,35 +251,37 @@
 
 // A 128-bit subvector extract from the first 256-bit vector position
 // is a subregister copy that needs no instruction.
-def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))),
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
           (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
-def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))),
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
           (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
 
-def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))),
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
           (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
-def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))),
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
           (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
 
-def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))),
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
           (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
-def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))),
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
           (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
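To make the comment above concrete: the low 128 bits of a YMM register alias
the matching XMM register, so these extracts become subregister copies that
the register allocator can usually fold away entirely. Illustrative MI, under
the usual register-naming assumptions:

    //   (v4i32 (extract_subvector (v8i32 %ymm0), (iPTR 0)))
    //     -> %xmm0 = EXTRACT_SUBREG %ymm0, sub_xmm   ; no instruction emitted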
 
 // A 128-bit subvector insert to the first 256-bit vector position
 // is a subregister copy that needs no instruction.
-def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
           (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
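A note on AddedComplexity = 25: instruction selection scores every pattern
that matches a node and picks the highest scorer, so the bump is what keeps
the free subregister copy ahead of a real vinsertf128. Roughly, for the same
DAG:

    //   (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0))
    //     -> INSERT_SUBREG (IMPLICIT_DEF), $src, sub_xmm  ; wins at 25
    //     -> vinsertf128 form                             ; default score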
 
 // Implicitly promote a 32-bit scalar to a vector.
 def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
@@ -362,7 +364,7 @@
   def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
 }
 
-// Alias instructions that map fld0 to pxor for sse.
+// Alias instructions that map fld0 to xorps for SSE or vxorps for AVX.
 // This is expanded by ExpandPostRAPseudos.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isPseudo = 1 in {
@@ -382,11 +384,11 @@
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-zeros value if folding it would be beneficial.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isPseudo = 1, neverHasSideEffects = 1 in {
-def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
+    isPseudo = 1 in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
 }
 
-def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
 def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
 def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
@@ -394,35 +396,29 @@
 def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
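Folding the v4f32 all-zeros pattern into V_SET0 itself, rather than keeping a
separate Pat, lets the pseudo carry the value directly. For context, a sketch
of the post-RA expansion, which happens in C++ rather than in this file:

    //   V_SET0 %xmm0  ->  xorps %xmm0, %xmm0            ; SSE
    //   V_SET0 %xmm0  ->  vxorps %xmm0, %xmm0, %xmm0    ; AVX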
 
 
-// The same as done above but for AVX.  The 256-bit ISA does not support PI,
+// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
 // and doesn't need it because on Sandy Bridge the register is set to zero
 // at the rename stage without using any execution unit, so SET0PSY
 // and SET0PDY can be used for vector int instructions without penalty.
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementatioan, it does not expand the instructions below like
-// X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1 in {
-let Predicates = [HasAVX] in {
-def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
-                   [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
-                   [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
-}
-let Predicates = [HasAVX2], neverHasSideEffects = 1 in
-def AVX2_SET0   : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "",
-                   []>, VEX_4V;
+    isPseudo = 1, Predicates = [HasAVX] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
 }
 
-let Predicates = [HasAVX2], AddedComplexity = 5 in {
-  def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>;
-  def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>;
-  def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>;
-  def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>;
+let Predicates = [HasAVX] in
+  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
 }
 
-// AVX has no support for 256-bit integer instructions, but since the 128-bit
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
 // VPXOR instruction writes zero to its upper part, it's safe to build zeros.
+let Predicates = [HasAVX1Only] in {
 def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
 def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
           (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
@@ -438,22 +434,17 @@
 def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
 def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
           (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
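The SUBREG_TO_REG patterns above lean on a VEX guarantee worth spelling out:
a 128-bit VEX instruction zeroes bits 255:128 of the containing YMM register,
so one 128-bit self-xor already materializes a 256-bit zero. At the assembly
level, roughly:

    //   vxorps %xmm0, %xmm0, %xmm0   ; also zeroes %ymm0[255:128]
    // SUBREG_TO_REG merely records that the upper half is known zero.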
 
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-ones value if folding it would be beneficial.
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementation, it does not expand the instructions below like
-// X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
-  let Predicates = [HasAVX] in
-  def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
-                         [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
-  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
-                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+    isPseudo = 1 in {
+  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
   let Predicates = [HasAVX2] in
-  def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "",
-                          [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V;
+  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
 }
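The deleted encodings hint at what these pseudos expand to: opcode 0x76 in
the 66-prefixed map is pcmpeqd, and comparing a register against itself sets
every lane to all ones. Sketch of the expansion:

    //   V_SETALLONES %xmm0    ->  pcmpeqd %xmm0, %xmm0
    //   AVX2_SETALLONES %ymm0 ->  vpcmpeqd %ymm0, %ymm0, %ymm0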
 
 
@@ -605,27 +596,27 @@
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))),
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   }
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
+                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0),
                            (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                            sub_xmm)>;
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
+                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0),
                            (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                            sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))),
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
 
   // Move low f64 and clear high bits.
@@ -704,7 +695,7 @@
             (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   let AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
@@ -738,7 +729,7 @@
             (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
 }
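The HasSSE1/HasSSE2 to UseSSE1/UseSSE2 renames recurring through this file
change matching behavior, not just spelling: the Use* forms are presumably
defined to also require that AVX is off, so these legacy-encoded patterns
never compete with the VEX forms above. Assumed shape of the predicates
(defined elsewhere in the patch, not in this hunk):

    def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
    def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;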
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   let AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSD to the lower bits.
@@ -822,16 +813,16 @@
 
 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                               "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
-                              TB, VEX;
+                              TB, VEX, VEX_L;
 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                               "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
-                              TB, OpSize, VEX;
+                              TB, OpSize, VEX, VEX_L;
 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                               "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
-                              TB, VEX;
+                              TB, VEX, VEX_L;
 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                               "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
-                              TB, OpSize, VEX;
+                              TB, OpSize, VEX, VEX_L;
 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                               TB;
@@ -864,19 +855,19 @@
 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
                    [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
-                   IIC_SSE_MOVA_P_MR>, VEX;
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movapd\t{$src, $dst|$dst, $src}",
                    [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
-                   IIC_SSE_MOVA_P_MR>, VEX;
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movups\t{$src, $dst|$dst, $src}",
                    [(store (v8f32 VR256:$src), addr:$dst)],
-                   IIC_SSE_MOVU_P_MR>, VEX;
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v4f64 VR256:$src), addr:$dst)],
-                   IIC_SSE_MOVU_P_MR>, VEX;
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
 
 // For disassembler
 let isCodeGenOnly = 1 in {
@@ -899,33 +890,33 @@
   def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                             (ins VR256:$src),
                             "movaps\t{$src, $dst|$dst, $src}", [],
-                            IIC_SSE_MOVA_P_RR>, VEX;
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
   def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                             (ins VR256:$src),
                             "movapd\t{$src, $dst|$dst, $src}", [],
-                            IIC_SSE_MOVA_P_RR>, VEX;
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
   def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                             (ins VR256:$src),
                             "movups\t{$src, $dst|$dst, $src}", [],
-                            IIC_SSE_MOVU_P_RR>, VEX;
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
   def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                             (ins VR256:$src),
                             "movupd\t{$src, $dst|$dst, $src}", [],
-                            IIC_SSE_MOVU_P_RR>, VEX;
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
 def : Pat<(v8i32 (X86vzmovl
-                        (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))),
+                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
           (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
 def : Pat<(v4i64 (X86vzmovl
-                        (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))),
+                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
           (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
 def : Pat<(v8f32 (X86vzmovl
-                        (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))),
+                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
           (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
 def : Pat<(v4f64 (X86vzmovl
-                        (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))),
+                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
           (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
 }
 
@@ -975,10 +966,10 @@
             (VMOVUPDmr addr:$dst, VR128:$src)>;
 }
 
-let Predicates = [HasSSE1] in
+let Predicates = [UseSSE1] in
   def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
             (MOVUPSmr addr:$dst, VR128:$src)>;
-let Predicates = [HasSSE2] in
+let Predicates = [UseSSE2] in
   def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
             (MOVUPDmr addr:$dst, VR128:$src)>;
 
@@ -1028,12 +1019,52 @@
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(store (v32i8 VR256:$src), addr:$dst),
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  // Special patterns for storing subvector extracts of the lower 128 bits.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
+  def : Pat<(alignedstore (v2f64 (extract_subvector
+                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4f32 (extract_subvector
+                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2f64 (extract_subvector
+                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4f32 (extract_subvector
+                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v2i64 (extract_subvector
+                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
 }
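To quantify the comment at the top of this block: extracting lane 0 is a
subregister copy, so the store can be issued as a plain 128-bit move instead
of an extract-to-memory. For one aligned case, roughly:

    //   vextractf128 $0, %ymm0, (%rdi)   ; what VEXTRACTF128mr would emit
    //   vmovaps %xmm0, (%rdi)            ; what the patterns above select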
 
 // Use movaps / movups for SSE integer load / store (one byte shorter).
 // The instructions selected below are then converted to MOVDQA/MOVDQU
 // during the SSE domain pass.
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   def : Pat<(alignedloadv2i64 addr:$src),
             (MOVAPSrm addr:$src)>;
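    // Encoding note (illustrative): movaps is 0F 28 /r, while movdqa needs
    // a 66 prefix (66 0F 6F /r), hence the "one byte shorter" above; the
    // execution domain fixup pass may still rewrite these to MOVDQA/MOVDQU.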
   def : Pat<(loadv2i64 addr:$src),
@@ -1180,7 +1211,7 @@
             (VMOVLPDmr addr:$src1, VR128:$src2)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
   def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
                                  (iPTR 0))), addr:$src1),
@@ -1205,7 +1236,7 @@
             (MOVLPSmr addr:$src1, VR128:$src2)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // Shuffle with MOVLPD
   def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
             (MOVLPDrm VR128:$src1, addr:$src2)>;
@@ -1271,7 +1302,7 @@
             (VMOVHPSrm VR128:$src1, addr:$src2)>;
 
   // FIXME: Instead of X86Unpckl, there should be an X86Movlhpd here; the problem
-  // is during lowering, where it's not possible to recognize the load fold 
+  // is during lowering, where it's not possible to recognize the load fold
   // because it has two uses through a bitcast. One use disappears at isel time
   // and the fold opportunity reappears.
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1279,7 +1310,7 @@
             (VMOVHPDrm VR128:$src1, addr:$src2)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   // MOVHPS patterns
   def : Pat<(X86Movlhps VR128:$src1,
                  (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
@@ -1289,9 +1320,9 @@
             (MOVHPSrm VR128:$src1, addr:$src2)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // FIXME: Instead of X86Unpckl, there should be an X86Movlhpd here; the problem
-  // is during lowering, where it's not possible to recognize the load fold 
+  // is during lowering, where it's not possible to recognize the load fold
   // because it has two uses through a bitcast. One use disappears at isel time
   // and the fold opportunity reappears.
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1346,7 +1377,7 @@
             (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   // MOVLHPS patterns
   def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
             (MOVLHPSrr VR128:$src1, VR128:$src2)>;
@@ -1456,7 +1487,7 @@
 def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
                 (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
 
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX] in {
   def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
             (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
   def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -1628,12 +1659,12 @@
 defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, SSE_CVT_PS>,
-                               TB, VEX, Requires<[HasAVX]>;
+                               TB, VEX, VEX_L, Requires<[HasAVX]>;
 
 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                             "cvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, SSE_CVT_PS>,
-                            TB, Requires<[HasSSE2]>;
+                            TB, Requires<[UseSSE2]>;
 
 /// SSE 2 Only
 
@@ -1663,7 +1694,7 @@
                       [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                       IIC_SSE_CVT_Scalar_RM>,
                       XD,
-                  Requires<[HasSSE2, OptForSize]>;
+                  Requires<[UseSSE2, OptForSize]>;
 
 def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1684,13 +1715,13 @@
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
-                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[HasSSE2]>;
+                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>;
 def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                           VR128:$src1, sse_load_f64:$src2))],
-                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[HasSSE2]>;
+                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>;
 }
 
 // Convert scalar single to scalar double
@@ -1709,30 +1740,28 @@
                     XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
 }
 
-let AddedComplexity = 1 in { // give AVX priority
-  def : Pat<(f64 (fextend FR32:$src)),
-            (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
-  def : Pat<(fextend (loadf32 addr:$src)),
-            (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
-
-  def : Pat<(extloadf32 addr:$src),
-            (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
-            Requires<[HasAVX, OptForSize]>;
-  def : Pat<(extloadf32 addr:$src),
-            (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
-            Requires<[HasAVX, OptForSpeed]>;
-} // AddedComplexity = 1
+def : Pat<(f64 (fextend FR32:$src)),
+    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+    Requires<[HasAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+    Requires<[HasAVX, OptForSpeed]>;
 
 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
                    [(set FR64:$dst, (fextend FR32:$src))],
                    IIC_SSE_CVT_Scalar_RR>, XS,
-                 Requires<[HasSSE2]>;
+                 Requires<[UseSSE2]>;
 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
                    [(set FR64:$dst, (extloadf32 addr:$src))],
                    IIC_SSE_CVT_Scalar_RM>, XS,
-                 Requires<[HasSSE2, OptForSize]>;
+                 Requires<[UseSSE2, OptForSize]>;
 
 // extload f32 -> f64.  This matches load+fextend because we have a hack in
 // the isel (PreprocessForFPConvert) that can introduce loads after dag
@@ -1740,9 +1769,9 @@
 // Since these loads aren't folded into the fextend, we have to match it
 // explicitly here.
 def : Pat<(fextend (loadf32 addr:$src)),
-          (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>;
+          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
 def : Pat<(extloadf32 addr:$src),
-          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
+          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
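The comment above deserves a concrete illustration: PreprocessForFPConvert can
split a fextend-of-load after DAG combine has run, so the load is no longer
folded into the extend, and the two patterns catch both tuning modes:

    //   (extloadf32 addr)  ->  CVTSS2SDrm addr             ; OptForSize
    //   (extloadf32 addr)  ->  CVTSS2SDrr (MOVSSrm addr)   ; OptForSpeed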
 
 def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1762,13 +1791,13 @@
                     "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
-                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>;
+                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>;
 def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                     "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
-                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2]>;
+                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>;
 }
 
 // Convert packed single/double fp to doubleword
@@ -1785,12 +1814,12 @@
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
-                        IIC_SSE_CVT_PS_RR>, VEX;
+                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
-                        IIC_SSE_CVT_PS_RM>, VEX;
+                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtps2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
@@ -1824,7 +1853,7 @@
 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX;
+                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L;
 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
@@ -1860,12 +1889,12 @@
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst,
                             (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
-                          IIC_SSE_CVT_PS_RR>, VEX;
+                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
                                              (memopv8f32 addr:$src)))],
-                          IIC_SSE_CVT_PS_RM>, VEX;
+                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
 
 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttps2dq\t{$src, $dst|$dst, $src}",
@@ -1904,7 +1933,7 @@
             (VCVTTPS2DQYrm addr:$src)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
             (CVTDQ2PSrr VR128:$src)>;
   def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
@@ -1945,7 +1974,7 @@
                          "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
-                         IIC_SSE_CVT_PD_RR>, VEX;
+                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                          "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
@@ -1978,31 +2007,31 @@
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, TB, VEX;
-let neverHasSideEffects = 1, mayLoad = 1 in
 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
-                     "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
-                     IIC_SSE_CVT_PD_RM>, TB, VEX;
+                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                    IIC_SSE_CVT_PD_RM>, TB, VEX;
 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
                        (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
-                     IIC_SSE_CVT_PD_RR>, TB, VEX;
+                     IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L;
 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
                        (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
-                     IIC_SSE_CVT_PD_RM>, TB, VEX;
+                     IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                        IIC_SSE_CVT_PD_RR>, TB;
-let neverHasSideEffects = 1, mayLoad = 1 in
 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
-                       "cvtps2pd\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_CVT_PD_RM>, TB;
+                   "cvtps2pd\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                   IIC_SSE_CVT_PD_RM>, TB;
 }
 
 // Convert Packed DW Integers to Packed Double FP
@@ -2019,11 +2048,11 @@
                      "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
                        (int_x86_avx_cvtdq2_pd_256
-                        (bitconvert (memopv2i64 addr:$src))))]>, VEX;
+                        (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L;
 def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                      "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
-                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX;
+                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L;
 }
 
 let neverHasSideEffects = 1, mayLoad = 1 in
@@ -2066,7 +2095,7 @@
                         "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
-                        IIC_SSE_CVT_PD_RR>, VEX;
+                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
@@ -2096,24 +2125,41 @@
             (VCVTDQ2PSYrm addr:$src)>;
 
   // Match fround and fextend for 128/256-bit conversions
+  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+            (VCVTPD2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+            (VCVTPD2PSXrm addr:$src)>;
   def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
             (VCVTPD2PSYrr VR256:$src)>;
   def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
             (VCVTPD2PSYrm addr:$src)>;
 
+  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+            (VCVTPS2PDrr VR128:$src)>;
   def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
             (VCVTPS2PDYrr VR128:$src)>;
-  def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
+  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
             (VCVTPS2PDYrm addr:$src)>;
 }
 
+let Predicates = [UseSSE2] in {
+  // Match fround and fextend for 128 conversions
+  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+            (CVTPD2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+            (CVTPD2PSrm addr:$src)>;
+
+  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+            (CVTPS2PDrr VR128:$src)>;
+}
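The X86vfpround/X86vfpext selectors used above are target-specific nodes
rather than the generic fround/fextend, which lets the 128-bit forms, where
only some result lanes are defined, be expressed. A hedged sketch of the
assumed node shape; the real definition lives in X86InstrFragmentsSIMD.td,
outside this hunk:

    def X86vfpext : SDNode<"X86ISD::VFPEXT",
                           SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
                                                SDTCisVT<1, v4f32>]>>;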
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Compare Instructions
 //===----------------------------------------------------------------------===//
 
 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
-                            Operand CC, SDNode OpNode, ValueType VT, 
+                            Operand CC, SDNode OpNode, ValueType VT,
                             PatFrag ld_frag, string asm, string asm_alt,
                             OpndItins itins> {
   def rr : SIi8<0xC2, MRMSrcReg,
@@ -2259,7 +2305,7 @@
 
 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
-                            Operand CC, Intrinsic Int, string asm, 
+                            Operand CC, Intrinsic Int, string asm,
                             string asm_alt, Domain d> {
   def rri : PIi8<0xC2, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
@@ -2292,11 +2338,11 @@
 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
                "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-               SSEPackedSingle>, TB, VEX_4V;
+               SSEPackedSingle>, TB, VEX_4V, VEX_L;
 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
                "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-               SSEPackedDouble>, TB, OpSize, VEX_4V;
+               SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
 let Constraints = "$src1 = $dst" in {
   defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                  "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
@@ -2328,14 +2374,14 @@
           (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
 def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
           (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
 def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
           (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
 def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
           (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
 def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
@@ -2366,13 +2412,13 @@
            memopv4f32, SSEPackedSingle>, TB, VEX_4V;
 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
            "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           memopv8f32, SSEPackedSingle>, TB, VEX_4V;
+           memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
 defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
            "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
            memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
            "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
-           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
 
 let Constraints = "$src1 = $dst" in {
   defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2412,7 +2458,7 @@
             (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
   def : Pat<(v4i32 (X86Shufp VR128:$src1,
                        (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
             (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
@@ -2420,7 +2466,7 @@
             (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // Generic SHUFPD patterns
   def : Pat<(v2i64 (X86Shufp VR128:$src1,
                        (memopv2i64 addr:$src2), (i8 imm:$imm))),
@@ -2466,16 +2512,16 @@
 
 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
       VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedSingle>, TB, VEX_4V;
+                     SSEPackedSingle>, TB, VEX_4V, VEX_L;
 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
       VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedDouble>, TB, OpSize, VEX_4V;
+                     SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
       VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedSingle>, TB, VEX_4V;
+                     SSEPackedSingle>, TB, VEX_4V, VEX_L;
 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
       VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedDouble>, TB, OpSize, VEX_4V;
+                     SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
 
 let Constraints = "$src1 = $dst" in {
   defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
@@ -2492,7 +2538,27 @@
                        SSEPackedDouble>, TB, OpSize;
 } // Constraints = "$src1 = $dst"
 
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
   // FIXME: Instead of X86Movddup, there should be an X86Unpckl here; the
   // problem is during lowering, where it's not possible to recognize the load
   // fold because it has two uses through a bitcast. One use disappears at isel
@@ -2501,7 +2567,7 @@
             (VUNPCKLPDrr VR128:$src, VR128:$src)>;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   // FIXME: Instead of X86Movddup, there should be an X86Unpckl here; the
   // problem is during lowering, where it's not possible to recognize the load
   // fold because it has two uses through a bitcast. One use disappears at isel
@@ -2532,10 +2598,11 @@
                                         "movmskpd", SSEPackedDouble>, TB,
                                         OpSize, VEX;
   defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
-                                        "movmskps", SSEPackedSingle>, TB, VEX;
+                                        "movmskps", SSEPackedSingle>, TB,
+                                        VEX, VEX_L;
   defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                         "movmskpd", SSEPackedDouble>, TB,
-                                        OpSize, VEX;
+                                        OpSize, VEX, VEX_L;
 
   def : Pat<(i32 (X86fgetsign FR32:$src)),
             (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -2556,11 +2623,11 @@
              OpSize, VEX;
   def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
              "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedSingle>, TB, VEX;
+             SSEPackedSingle>, TB, VEX, VEX_L;
   def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
              "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
              SSEPackedDouble>, TB,
-             OpSize, VEX;
+             OpSize, VEX, VEX_L;
 }
 
 defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
@@ -2570,16 +2637,16 @@
 
 def : Pat<(i32 (X86fgetsign FR32:$src)),
           (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
-      Requires<[HasSSE1]>;
+      Requires<[UseSSE1]>;
 def : Pat<(i64 (X86fgetsign FR32:$src)),
           (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
-      Requires<[HasSSE1]>;
+      Requires<[UseSSE1]>;
 def : Pat<(i32 (X86fgetsign FR64:$src)),
           (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
-      Requires<[HasSSE2]>;
+      Requires<[UseSSE2]>;
 def : Pat<(i64 (X86fgetsign FR64:$src)),
           (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
-      Requires<[HasSSE2]>;
+      Requires<[UseSSE2]>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Logical Instructions
@@ -2638,13 +2705,13 @@
 
 let Predicates = [HasAVX2] in {
 defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64,
-                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPORY  : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64,
-                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64,
-                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64,
-                            i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V;
+                            i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2675,14 +2742,12 @@
 }
 
 // Alias bitwise logical operations using SSE logical ops on packed FP values.
-let mayLoad = 0 in {
-  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
-                SSE_BIT_ITINS_P>;
-  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
-                SSE_BIT_ITINS_P>;
-  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
-                SSE_BIT_ITINS_P>;
-}
+defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+              SSE_BIT_ITINS_P>;
+defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+              SSE_BIT_ITINS_P>;
+defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+              SSE_BIT_ITINS_P>;
 
 let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
   defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
@@ -2732,7 +2797,7 @@
           !strconcat(OpcodeStr, "ps"), f256mem,
           [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
           [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
-                                    (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V;
+                             (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
 
     defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
           !strconcat(OpcodeStr, "pd"), f256mem,
@@ -2740,7 +2805,7 @@
                                     (bc_v4i64 (v4f64 VR256:$src2))))],
           [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                     (memopv4i64 addr:$src2)))], 0>,
-                                    TB, OpSize, VEX_4V;
+                                    TB, OpSize, VEX_4V, VEX_L;
 }
 
 // AVX 256-bit packed logical ops forms
@@ -2786,27 +2851,23 @@
 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                    SizeItins itins,
                                    bit Is2Addr = 1> {
-  let mayLoad = 0 in {
   defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
               v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>,
               TB;
   defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
               v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>,
               TB, OpSize;
-  }
 }
 
 multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
                                     SDNode OpNode,
                                     SizeItins itins> {
-  let mayLoad = 0 in {
-    defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
+  defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
                 v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>,
-                TB;
-    defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
+                TB, VEX_L;
+  defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
                 v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>,
-                TB, OpSize;
-  }
+                TB, OpSize, VEX_L;
 }
 
 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
@@ -2838,11 +2899,11 @@
                                         SizeItins itins> {
   defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
      !strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32,
-      SSEPackedSingle, itins.s, 0>, TB;
+      SSEPackedSingle, itins.s, 0>, TB, VEX_L;
 
   defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
      !strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64,
-      SSEPackedDouble, itins.d, 0>, TB, OpSize;
+      SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_L;
 }
 
 // Binary Arithmetic instructions
@@ -2864,7 +2925,8 @@
               basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
                 VEX_4V, VEX_LIG;
   defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P, 0>,
-              basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, VEX_4V;
+              basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+                VEX_4V;
   defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
               basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
                 VEX_4V, VEX_LIG;
@@ -2915,6 +2977,23 @@
   }
 }
 
+let isCodeGenOnly = 1 in {
+  defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
+       VEX_4V, VEX_LIG;
+  defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>,
+       basic_sse12_fp_binop_p_y<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, VEX_4V;
+  defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
+       VEX_4V, VEX_LIG;
+  defm VMINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P, 0>,
+       basic_sse12_fp_binop_p_y<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, VEX_4V;
+  let Constraints = "$src1 = $dst" in {
+    defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>,
+         basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
+    defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>,
+         basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
+  }
+}
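
The isCodeGenOnly block above gives the compiler commutable max/min
variants without adding duplicate assembler entries. X86fmaxc and
X86fminc are assumed (per X86InstrFragmentsSIMD.td in this series) to be
commutative SDNodes that the DAG combiner forms when operand order
provably cannot matter, e.g. when NaN and signed-zero concerns are gone:

def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp, [SDNPCommutative]>;
def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp, [SDNPCommutative]>;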
+
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
 /// represent the associated intrinsic operation.  This form is unlike the
@@ -2952,7 +3031,7 @@
   def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                 [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
-            Requires<[HasSSE1, OptForSize]>;
+            Requires<[UseSSE1, OptForSize]>;
   def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>;
@@ -2966,7 +3045,7 @@
   def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  let mayLoad = 1 in
+  let mayLoad = 1 in {
   def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
@@ -2974,6 +3053,7 @@
                 (ins VR128:$src1, ssmem:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+  }
 }
 
 /// sse1_fp_unop_p - SSE1 unops in packed form.
@@ -2993,11 +3073,11 @@
   def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
               !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
               [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
-              itins.rr>;
+              itins.rr>, VEX_L;
   def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                 [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
-                itins.rm>;
+                itins.rm>, VEX_L;
 }
 
 /// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
@@ -3019,11 +3099,11 @@
   def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                     !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst, (V4F32Int VR256:$src))],
-                    itins.rr>;
+                    itins.rr>, VEX_L;
   def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                     !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))],
-                    itins.rm>;
+                    itins.rm>, VEX_L;
 }
 
 /// sse2_fp_unop_s - SSE2 unops in scalar form.
@@ -3036,7 +3116,7 @@
   def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                 !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                 [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
-            Requires<[HasSSE2, OptForSize]>;
+            Requires<[UseSSE2, OptForSize]>;
   def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>;
@@ -3046,20 +3126,20 @@
 }
 
 /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
+let hasSideEffects = 0 in
 multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
-  let neverHasSideEffects = 1 in {
   def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
                !strconcat(OpcodeStr,
                           "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  let mayLoad = 1 in
+  let mayLoad = 1 in {
   def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
                !strconcat(OpcodeStr,
                           "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  }
   def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, sdmem:$src2),
                !strconcat(OpcodeStr,
                           "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+  }
 }
 
 /// sse2_fp_unop_p - SSE2 unops in vector forms.
@@ -3079,11 +3159,11 @@
   def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
               !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
               [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
-              itins.rr>;
+              itins.rr>, VEX_L;
   def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                 [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
-                itins.rm>;
+                itins.rm>, VEX_L;
 }
 
 /// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms.
@@ -3105,11 +3185,11 @@
   def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                     !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst, (V2F64Int VR256:$src))],
-                    itins.rr>;
+                    itins.rr>, VEX_L;
   def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                     !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))],
-                    itins.rm>;
+                    itins.rm>, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
@@ -3150,7 +3230,6 @@
                                     SSE_RCPP>, VEX;
 }
 
-let AddedComplexity = 1 in {
 def : Pat<(f32 (fsqrt FR32:$src)),
           (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
 def : Pat<(f32 (fsqrt (load addr:$src))),
@@ -3173,9 +3252,8 @@
 def : Pat<(f32 (X86frcp (load addr:$src))),
           (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
           Requires<[HasAVX, OptForSize]>;
-}
 
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX] in {
   def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
             (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3215,17 +3293,52 @@
              sse2_fp_unop_p<0x51, "sqrt",  fsqrt, SSE_SQRTS>,
              sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTS>;
 
+/// sse1_fp_unop_rw - SSE1 unops where vector form has a read-write operand.
+multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               Intrinsic F32Int, OpndItins itins> {
+  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]>;
+  // For scalar unary operations, fold a load into the operation
+  // only in OptForSize mode. It eliminates an instruction, but it also
+  // eliminates a whole-register clobber (the load), so it introduces a
+  // partial register update condition.
+  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+            Requires<[UseSSE1, OptForSize]>;
+  let Constraints = "$src1 = $dst" in {
+    def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src2),
+                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                      [], itins.rr>;
+    def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, ssmem:$src2),
+                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                      [], itins.rm>;
+  }
+}
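
To make the OptForSize restriction above concrete, an illustrative
(assumed) codegen comparison, not taken from this patch:

// Unfolded: movss rewrites all of %xmm0, so no stale upper lanes:
//   movss   (%rax), %xmm0
//   rsqrtss %xmm0, %xmm0
// Folded (one instruction shorter, hence OptForSize only): the memory
// form merges into %xmm0, keeping its upper lanes live and risking a
// partial-register stall:
//   rsqrtss (%rax), %xmm0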
+
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
-                            SSE_SQRTS>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
+                             SSE_SQRTS>,
              sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
              sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
                             SSE_SQRTS>;
-defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
-                            SSE_RCPS>,
+let Predicates = [UseSSE1] in {
+  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
+}
+
+defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
+                             SSE_RCPS>,
              sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPS>,
              sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPS>;
+let Predicates = [UseSSE1] in {
+  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+            (RCPSSr_Int VR128:$src, VR128:$src)>;
+}
 
 // There is no f64 version of the reciprocal approximation instructions.
 
@@ -3263,20 +3376,20 @@
                        "movntps\t{$src, $dst|$dst, $src}",
                        [(alignednontemporalstore (v8f32 VR256:$src),
                                                  addr:$dst)],
-                                                 IIC_SSE_MOVNT>, VEX;
+                                                 IIC_SSE_MOVNT>, VEX, VEX_L;
   def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                        (ins f256mem:$dst, VR256:$src),
                        "movntpd\t{$src, $dst|$dst, $src}",
                        [(alignednontemporalstore (v4f64 VR256:$src),
                                                  addr:$dst)],
-                                                 IIC_SSE_MOVNT>, VEX;
+                                                 IIC_SSE_MOVNT>, VEX, VEX_L;
   let ExeDomain = SSEPackedInt in
   def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntdq\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4i64 VR256:$src),
                                                 addr:$dst)],
-                                                IIC_SSE_MOVNT>, VEX;
+                                                IIC_SSE_MOVNT>, VEX, VEX_L;
 }
 
 let AddedComplexity = 400 in { // Prefer non-temporal versions
@@ -3296,7 +3409,7 @@
                     IIC_SSE_MOVNT>;
 
 def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
-          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
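
// A high AddedComplexity (400 above) biases ISel pattern ordering, so
// these nontemporal-store patterns beat the plain store patterns that
// would otherwise match the same DAG.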
 
 // There is no AVX form for instructions below this point
 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
@@ -3385,14 +3498,14 @@
                     VEX;
 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
-                    VEX;
+                    VEX, VEX_L;
 }
 def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                     VEX;
 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
-                    VEX;
+                    VEX, VEX_L;
 
 // For Disassembler
 let isCodeGenOnly = 1 in {
@@ -3402,16 +3515,14 @@
                         VEX;
 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                         "movdqa\t{$src, $dst|$dst, $src}", [],
-                        IIC_SSE_MOVA_P_RR>,
-                        VEX;
+                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
 def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>,
                         VEX;
 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", [],
-                        IIC_SSE_MOVU_P_RR>,
-                        VEX;
+                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
 }
 
 let canFoldAsLoad = 1, mayLoad = 1 in {
@@ -3420,14 +3531,14 @@
                    VEX;
 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
-                   VEX;
+                   VEX, VEX_L;
 let Predicates = [HasAVX] in {
   def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                     "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                     XS, VEX;
   def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                     "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
-                    XS, VEX;
+                    XS, VEX, VEX_L;
 }
 }
 
@@ -3439,14 +3550,14 @@
 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
-                     VEX;
+                     VEX, VEX_L;
 let Predicates = [HasAVX] in {
 def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                   XS, VEX;
 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
-                  XS, VEX;
+                  XS, VEX, VEX_L;
 }
 }
 
@@ -3456,7 +3567,7 @@
 
 def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}",
-                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
+                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
 
 // For Disassembler
 let isCodeGenOnly = 1 in {
@@ -3466,7 +3577,7 @@
 
 def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}",
-                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
+                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
 }
 
 let canFoldAsLoad = 1, mayLoad = 1 in {
@@ -3478,7 +3589,7 @@
                    "movdqu\t{$src, $dst|$dst, $src}",
                    [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
                    IIC_SSE_MOVU_P_RM>,
-                 XS, Requires<[HasSSE2]>;
+                 XS, Requires<[UseSSE2]>;
 }
 
 let mayStore = 1 in {
@@ -3490,7 +3601,7 @@
                    "movdqu\t{$src, $dst|$dst, $src}",
                    [/*(store (v2i64 VR128:$src), addr:$dst)*/],
                    IIC_SSE_MOVU_P_MR>,
-                 XS, Requires<[HasSSE2]>;
+                 XS, Requires<[UseSSE2]>;
 }
 
 // Intrinsic forms of MOVDQU load and store
@@ -3504,7 +3615,7 @@
                        "movdqu\t{$src, $dst|$dst, $src}",
                        [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)],
                        IIC_SSE_MOVU_P_MR>,
-                     XS, Requires<[HasSSE2]>;
+                     XS, Requires<[UseSSE2]>;
 
 } // ExeDomain = SSEPackedInt
 
@@ -3682,82 +3793,82 @@
 
 let Predicates = [HasAVX2] in {
 defm VPADDBY  : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64,
-                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPADDWY  : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64,
-                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPADDDY  : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64,
-                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPADDQY  : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64,
-                             i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V;
+                             i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64,
-                             i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+                             i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPSUBBY  : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64,
-                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPSUBWY  : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64,
-                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPSUBDY  : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
-                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPSUBQY  : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
-                             i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V;
+                             i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                                VR256, memopv4i64, i256mem,
-                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 
 // Intrinsic forms
 defm VPSUBSBY  : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPSUBSWY  : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPADDSBY  : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPADDSWY  : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_avx2_padds_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPADDUSBY : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_avx2_paddus_b,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPADDUSWY : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_avx2_paddus_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPMULHWY  : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_PMADD, 1, 0>, VEX_4V;
+                                  SSE_PMADD, 1, 0>, VEX_4V, VEX_L;
 defm VPAVGBY   : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPAVGWY   : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_avx2_pavg_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPMINUBY  : PDI_binop_rm_int<0xDA, "vpminub", int_x86_avx2_pminu_b,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPMINSWY  : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_avx2_pmins_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPMAXUBY  : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_avx2_pmaxu_b,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPMAXSWY  : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_avx2_pmaxs_w,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 defm VPSADBWY  : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_avx2_psad_bw,
                                   VR256, memopv4i64, i256mem,
-                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -3893,30 +4004,30 @@
 let Predicates = [HasAVX2] in {
 defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                              VR256, v16i16, v8i16, bc_v8i16,
-                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                              VR256, v8i32, v4i32, bc_v4i32,
-                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                              VR256, v4i64, v2i64, bc_v2i64,
-                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 
 defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                              VR256, v16i16, v8i16, bc_v8i16,
-                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                              VR256, v8i32, v4i32, bc_v4i32,
-                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                              VR256, v4i64, v2i64, bc_v2i64,
-                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 
 defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                              VR256, v16i16, v8i16, bc_v8i16,
-                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                              VR256, v8i32, v4i32, bc_v4i32,
-                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 
 let ExeDomain = SSEPackedInt in {
   // 256-bit logical shifts.
@@ -3925,13 +4036,13 @@
                     "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
                       (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
-                    VEX_4V;
+                    VEX_4V, VEX_L;
   def VPSRLDQYri : PDIi8<0x73, MRM3r,
                     (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                     "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
                       (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
-                    VEX_4V;
+                    VEX_4V, VEX_L;
   // PSRADQYri doesn't exist in SSE[1-3].
 }
 } // Predicates = [HasAVX2]
@@ -4002,7 +4113,7 @@
             (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
 }
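
These patterns map the bit-count psll_dq/psrl_dq intrinsics onto the
byte-granular shift instructions. BYTE_imm is presumed to be the usual
SDNodeXForm in this file that divides the immediate by eight:

def BYTE_imm : SDNodeXForm<imm, [{
  // Transformation function: imm >> 3
  return getI32Imm(N->getZExtValue() >> 3);
}]>;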
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
   def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
             (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
@@ -4045,22 +4156,22 @@
 let Predicates = [HasAVX2] in {
   defm VPCMPEQBY : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v32i8,
                                 VR256, memopv4i64, i256mem,
-                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
   defm VPCMPEQWY : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v16i16,
                                 VR256, memopv4i64, i256mem,
-                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
   defm VPCMPEQDY : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v8i32,
                                 VR256, memopv4i64, i256mem,
-                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
   defm VPCMPGTBY : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v32i8,
                                 VR256, memopv4i64, i256mem,
-                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
   defm VPCMPGTWY : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v16i16,
                                 VR256, memopv4i64, i256mem,
-                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
   defm VPCMPGTDY : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v8i32,
                                 VR256, memopv4i64, i256mem,
-                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -4103,13 +4214,13 @@
 let Predicates = [HasAVX2] in {
 defm VPACKSSWBY : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_avx2_packsswb,
                                    VR256, memopv4i64, i256mem,
-                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPACKSSDWY : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_avx2_packssdw,
                                    VR256, memopv4i64, i256mem,
-                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 defm VPACKUSWBY : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_avx2_packuswb,
                                    VR256, memopv4i64, i256mem,
-                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -4179,12 +4290,15 @@
 }
 
 let Predicates = [HasAVX2] in {
-  defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>, TB, OpSize, VEX;
-  defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>, XS, VEX;
-  defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX;
+  defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>,
+                                TB, OpSize, VEX, VEX_L;
+  defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>,
+                                  XS, VEX, VEX_L;
+  defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>,
+                                  XD, VEX, VEX_L;
 }
 
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
  let AddedComplexity = 5 in
   defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize;
 
@@ -4260,22 +4374,22 @@
 
 let Predicates = [HasAVX2] in {
   defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
-                                   bc_v32i8>, VEX_4V;
+                                   bc_v32i8>, VEX_4V, VEX_L;
   defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
-                                   bc_v16i16>, VEX_4V;
+                                   bc_v16i16>, VEX_4V, VEX_L;
   defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
-                                   bc_v8i32>, VEX_4V;
+                                   bc_v8i32>, VEX_4V, VEX_L;
   defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
-                                   bc_v4i64>, VEX_4V;
+                                   bc_v4i64>, VEX_4V, VEX_L;
 
   defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
-                                   bc_v32i8>, VEX_4V;
+                                   bc_v32i8>, VEX_4V, VEX_L;
   defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
-                                   bc_v16i16>, VEX_4V;
+                                   bc_v16i16>, VEX_4V, VEX_L;
   defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
-                                   bc_v8i32>, VEX_4V;
+                                   bc_v8i32>, VEX_4V, VEX_L;
   defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
-                                   bc_v4i64>, VEX_4V;
+                                   bc_v4i64>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -4299,28 +4413,6 @@
 }
 } // ExeDomain = SSEPackedInt
 
-// Patterns for using AVX1 instructions with integer vectors
-// Here to give AVX2 priority
-let Predicates = [HasAVX] in {
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
-            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
-            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
-            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
-            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
-
-  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
-            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
-            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
-            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
-  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
-            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
-}
-
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Extract and Insert
 //===---------------------------------------------------------------------===//
@@ -4369,7 +4461,7 @@
 }
 
 let Constraints = "$src1 = $dst" in
-  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>;
+  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
 
 } // ExeDomain = SSEPackedInt
 
@@ -4389,9 +4481,9 @@
 let Predicates = [HasAVX2] in {
 def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
-           [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX;
+           [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
 def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-           "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX;
+           "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
 }
 
 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
@@ -4530,7 +4622,7 @@
 // Move Packed Doubleword Int first element to Doubleword Int
 //
 def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                          "mov{d|q}\t{$src, $dst|$dst, $src}",
+                          "vmov{d|q}\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                            (iPTR 0)))],
                                                            IIC_SSE_MOVD_ToGP>,
@@ -4646,14 +4738,14 @@
   }
   // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                                (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))),
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                                (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))),
+                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
 }
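
// Note on the (i32 0) -> (iPTR 0) fixes above: subvector indices are
// pointer-sized in the DAG, so a literal i32 zero can fail to match in
// 64-bit mode, where iPTR legalizes to i64; (iPTR 0) matches both.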
 
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
   def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
             (MOVZDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
@@ -4693,7 +4785,7 @@
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                       IIC_SSE_MOVDQ>, XS,
-                    Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix
+                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
 
 //===---------------------------------------------------------------------===//
 // Move Packed Quadword Int to Quadword Int
@@ -4736,7 +4828,7 @@
                        (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                  (loadi64 addr:$src))))))],
                                                  IIC_SSE_MOVDQ>,
-                     XS, Requires<[HasSSE2]>;
+                     XS, Requires<[UseSSE2]>;
 
 let Predicates = [HasAVX], AddedComplexity = 20 in {
   def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
@@ -4747,7 +4839,7 @@
             (VMOVZQI2PQIrm addr:$src)>;
 }
 
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
   def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
             (MOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
@@ -4777,7 +4869,7 @@
                         "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                     IIC_SSE_MOVQ_RR>,
-                      XS, Requires<[HasSSE2]>;
+                      XS, Requires<[UseSSE2]>;
 
 let AddedComplexity = 20 in
 def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -4792,7 +4884,7 @@
                     [(set VR128:$dst, (v2i64 (X86vzmovl
                                              (loadv2i64 addr:$src))))],
                                              IIC_SSE_MOVDQ>,
-                      XS, Requires<[HasSSE2]>;
+                      XS, Requires<[UseSSE2]>;
 }
 
 let AddedComplexity = 20 in {
@@ -4802,7 +4894,7 @@
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
               (VMOVZPQILo2PQIrr VR128:$src)>;
   }
-  let Predicates = [HasSSE2] in {
+  let Predicates = [UseSSE2] in {
     def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
               (MOVZPQILo2PQIrm addr:$src)>;
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
@@ -4854,9 +4946,9 @@
   defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                        v4f32, VR128, memopv4f32, f128mem>, VEX;
   defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                                       v8f32, VR256, memopv8f32, f256mem>, VEX;
+                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
   defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                                       v8f32, VR256, memopv8f32, f256mem>, VEX;
+                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
 }
 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                    memopv4f32, f128mem>;
@@ -4882,7 +4974,7 @@
             (VMOVSLDUPYrm addr:$src)>;
 }
 
-let Predicates = [HasSSE3] in {
+let Predicates = [UseSSE3] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (MOVSHDUPrr VR128:$src)>;
   def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
@@ -4924,7 +5016,7 @@
 
 let Predicates = [HasAVX] in {
   defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
-  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
+  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
 }
 
 defm MOVDDUP : sse3_replicate_dfp<"movddup">;
@@ -4951,7 +5043,7 @@
             (VMOVDDUPYrr VR256:$src)>;
 }
 
-let Predicates = [HasSSE3] in {
+let Predicates = [UseSSE3] in {
   def : Pat<(X86Movddup (memopv2f64 addr:$src)),
             (MOVDDUPrm addr:$src)>;
   def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
@@ -4973,7 +5065,8 @@
                    [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
   def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "vlddqu\t{$src, $dst|$dst, $src}",
-                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, VEX;
+                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+                   VEX, VEX_L;
 }
 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "lddqu\t{$src, $dst|$dst, $src}",
@@ -5006,16 +5099,16 @@
     defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                                  f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
     defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
-                                 f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
+                               f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L;
   }
   let ExeDomain = SSEPackedDouble in {
     defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                                  f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
     defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
-                                 f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
+                           f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L;
   }
 }
-let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in {
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
   let ExeDomain = SSEPackedSingle in
   defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                               f128mem, SSE_ALU_F32P>, TB, XD;
@@ -5067,9 +5160,9 @@
     defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                             X86fhsub, 0>, VEX_4V;
     defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
-                            X86fhadd, 0>, VEX_4V;
+                            X86fhadd, 0>, VEX_4V, VEX_L;
     defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
-                            X86fhsub, 0>, VEX_4V;
+                            X86fhsub, 0>, VEX_4V, VEX_L;
   }
   let ExeDomain = SSEPackedDouble in {
     defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
@@ -5077,9 +5170,9 @@
     defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                             X86fhsub, 0>, VEX_4V;
     defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
-                            X86fhadd, 0>, VEX_4V;
+                            X86fhadd, 0>, VEX_4V, VEX_L;
     defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
-                            X86fhsub, 0>, VEX_4V;
+                            X86fhsub, 0>, VEX_4V, VEX_L;
   }
 }
 
@@ -5145,11 +5238,11 @@
 
 let Predicates = [HasAVX2] in {
   defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
-                                    int_x86_avx2_pabs_b>, VEX;
+                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
   defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
-                                    int_x86_avx2_pabs_w>, VEX;
+                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
   defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
-                                    int_x86_avx2_pabs_d>, VEX;
+                                    int_x86_avx2_pabs_d>, VEX, VEX_L;
 }
 
 defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
@@ -5288,37 +5381,37 @@
 let isCommutable = 0 in {
   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                   memopv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                   memopv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                   memopv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                   memopv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                   memopv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                   memopv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                   memopv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                   memopv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
-                                        int_x86_avx2_phadd_sw>, VEX_4V;
+                                        int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
   defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
-                                        int_x86_avx2_phsub_sw>, VEX_4V;
+                                        int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
   defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
-                                        int_x86_avx2_pmadd_ub_sw>, VEX_4V;
+                                       int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
 }
 defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
-                                        int_x86_avx2_pmul_hr_sw>, VEX_4V;
+                                        int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
 }
 
 // None of these have i8 immediate fields.
@@ -5397,8 +5490,8 @@
 let Predicates = [HasAVX] in
   defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
 let Predicates = [HasAVX2] in
-  defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V;
-let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
+  defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
   defm PALIGN : ssse3_palign<"palignr">;
 
 let Predicates = [HasAVX2] in {
@@ -5423,7 +5516,7 @@
           (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
 }
 
-let Predicates = [HasSSSE3] in {
+let Predicates = [UseSSSE3] in {
 def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
           (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
 def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
@@ -5504,17 +5597,17 @@
 
 let Predicates = [HasAVX2] in {
 defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
-                                        int_x86_avx2_pmovsxbw>, VEX;
+                                        int_x86_avx2_pmovsxbw>, VEX, VEX_L;
 defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
-                                        int_x86_avx2_pmovsxwd>, VEX;
+                                        int_x86_avx2_pmovsxwd>, VEX, VEX_L;
 defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
-                                        int_x86_avx2_pmovsxdq>, VEX;
+                                        int_x86_avx2_pmovsxdq>, VEX, VEX_L;
 defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
-                                        int_x86_avx2_pmovzxbw>, VEX;
+                                        int_x86_avx2_pmovzxbw>, VEX, VEX_L;
 defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
-                                        int_x86_avx2_pmovzxwd>, VEX;
+                                        int_x86_avx2_pmovzxwd>, VEX, VEX_L;
 defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
-                                        int_x86_avx2_pmovzxdq>, VEX;
+                                        int_x86_avx2_pmovzxdq>, VEX, VEX_L;
 }
 
 defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
@@ -5530,64 +5623,88 @@
             (VPMOVSXBWrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
             (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (VPMOVSXBWrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
             (VPMOVSXWDrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
             (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (VPMOVSXWDrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
             (VPMOVSXDQrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
             (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (VPMOVSXDQrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
             (VPMOVZXBWrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
             (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (VPMOVZXBWrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
             (VPMOVZXWDrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
             (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (VPMOVZXWDrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
             (VPMOVZXDQrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
             (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (VPMOVZXDQrm addr:$src)>;
 }
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   // Common patterns involving scalar load.
   def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
             (PMOVSXBWrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
             (PMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (PMOVSXBWrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
             (PMOVSXWDrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
             (PMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (PMOVSXWDrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
             (PMOVSXDQrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
             (PMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (PMOVSXDQrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
             (PMOVZXBWrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
             (PMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (PMOVZXBWrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
             (PMOVZXWDrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
             (PMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (PMOVZXWDrm addr:$src)>;
 
   def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
             (PMOVZXDQrm addr:$src)>;
   def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
             (PMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (PMOVZXDQrm addr:$src)>;
 }
 
 let Predicates = [HasAVX2] in {
@@ -5607,7 +5724,7 @@
   def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
 }
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
   def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
 }
@@ -5651,13 +5768,13 @@
 
 let Predicates = [HasAVX2] in {
 defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
-                                       int_x86_avx2_pmovsxbd>, VEX;
+                                       int_x86_avx2_pmovsxbd>, VEX, VEX_L;
 defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
-                                       int_x86_avx2_pmovsxwq>, VEX;
+                                       int_x86_avx2_pmovsxwq>, VEX, VEX_L;
 defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
-                                       int_x86_avx2_pmovzxbd>, VEX;
+                                       int_x86_avx2_pmovzxbd>, VEX, VEX_L;
 defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
-                                       int_x86_avx2_pmovzxwq>, VEX;
+                                       int_x86_avx2_pmovzxwq>, VEX, VEX_L;
 }
 
 defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
@@ -5678,7 +5795,7 @@
             (VPMOVZXWQrm addr:$src)>;
 }
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
             (PMOVSXBDrm addr:$src)>;
@@ -5726,9 +5843,9 @@
 }
 let Predicates = [HasAVX2] in {
 defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
-                                       int_x86_avx2_pmovsxbq>, VEX;
+                                       int_x86_avx2_pmovsxbq>, VEX, VEX_L;
 defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
-                                       int_x86_avx2_pmovzxbq>, VEX;
+                                       int_x86_avx2_pmovzxbq>, VEX, VEX_L;
 }
 defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
 defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
@@ -5746,7 +5863,7 @@
             (VPMOVZXBQrm addr:$src)>;
 }
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   // Common patterns involving a scalar load
   def : Pat<(int_x86_sse41_pmovsxbq
               (bitconvert (v4i32 (X86vzmovl
@@ -5759,6 +5876,100 @@
             (PMOVZXBQrm addr:$src)>;
 }
 
+let Predicates = [HasAVX2] in {
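+  // Zero-extend via the 256-bit AVX2 VPMOVZX forms. For a 256-bit source,
+  // only the low 128 bits feed the extend, so extract the low subregister
+  // first.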
+  def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
+  def : Pat<(v8i32  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
+
+  def : Pat<(v8i32  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
+
+  def : Pat<(v4i64  (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
+
+  def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
+            (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
+            (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
+            (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
+
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (VPMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+            (VPMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
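+  // The same zero-extend patterns for plain SSE4.1, used when AVX is
+  // unavailable.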
+  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
+
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+            (PMOVZXDQrm addr:$src)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Extract Instructions
 //===----------------------------------------------------------------------===//
@@ -5892,7 +6103,7 @@
                                               imm:$src2))),
                  addr:$dst),
           (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
-          Requires<[HasSSE41]>;
+          Requires<[UseSSE41]>;
 
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Insert Instructions
@@ -6139,7 +6350,7 @@
   defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                   memopv8f32, memopv4f64,
                                   int_x86_avx_round_ps_256,
-                                  int_x86_avx_round_pd_256>, VEX;
+                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
   defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                   int_x86_sse41_round_ss,
                                   int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
@@ -6164,6 +6375,15 @@
             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
   def : Pat<(f64 (ftrunc FR64:$src)),
             (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
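+  // Immediate 0x1 rounds toward negative infinity, i.e. floor.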
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x1))>;
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x1))>;
+  def : Pat<(v8f32 (ffloor VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x1))>;
+  def : Pat<(v4f64 (ffloor VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x1))>;
 }
 
 defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
@@ -6173,26 +6393,33 @@
 defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                                int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
 
-def : Pat<(ffloor FR32:$src),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
-def : Pat<(f64 (ffloor FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
-def : Pat<(f32 (fnearbyint FR32:$src)),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
-def : Pat<(f64 (fnearbyint FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
-def : Pat<(f32 (fceil FR32:$src)),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
-def : Pat<(f64 (fceil FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
-def : Pat<(f32 (frint FR32:$src)),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
-def : Pat<(f64 (frint FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
-def : Pat<(f32 (ftrunc FR32:$src)),
-          (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
-def : Pat<(f64 (ftrunc FR64:$src)),
-          (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+let Predicates = [UseSSE41] in {
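+  // Rounding immediates: 0x1 = toward -inf (floor), 0x2 = toward +inf
+  // (ceil), 0x3 = toward zero (trunc), 0x4 = current MXCSR mode (rint),
+  // 0xC = current mode with precision exceptions suppressed (nearbyint).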
+  def : Pat<(ffloor FR32:$src),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+  def : Pat<(f64 (ffloor FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+  def : Pat<(f32 (fnearbyint FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+  def : Pat<(f64 (fnearbyint FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+  def : Pat<(f32 (fceil FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+  def : Pat<(f64 (fceil FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+  def : Pat<(f32 (frint FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+  def : Pat<(f64 (frint FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+  def : Pat<(f32 (ftrunc FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+  def : Pat<(f64 (ftrunc FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x1))>;
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x1))>;
+}
 
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Packed Bit Test
@@ -6213,11 +6440,11 @@
 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
                 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
-                OpSize, VEX;
+                OpSize, VEX, VEX_L;
 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
                 [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
-                OpSize, VEX;
+                OpSize, VEX, VEX_L;
 }
 
 let Defs = [EFLAGS] in {
@@ -6246,11 +6473,13 @@
 let Defs = [EFLAGS], Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedSingle in {
 defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
-defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+                            VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
 defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
-defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+                            VEX_L;
 }
 }
 
@@ -6330,7 +6559,7 @@
           (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
 }
 
-/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
 multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId256> {
   let isCommutable = 1 in
@@ -6373,25 +6602,25 @@
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in
   defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
-                                        int_x86_avx2_packusdw>, VEX_4V;
+                                        int_x86_avx2_packusdw>, VEX_4V, VEX_L;
   defm VPMINSB   : SS41I_binop_rm_int_y<0x38, "vpminsb",
-                                        int_x86_avx2_pmins_b>, VEX_4V;
+                                        int_x86_avx2_pmins_b>, VEX_4V, VEX_L;
   defm VPMINSD   : SS41I_binop_rm_int_y<0x39, "vpminsd",
-                                        int_x86_avx2_pmins_d>, VEX_4V;
+                                        int_x86_avx2_pmins_d>, VEX_4V, VEX_L;
   defm VPMINUD   : SS41I_binop_rm_int_y<0x3B, "vpminud",
-                                        int_x86_avx2_pminu_d>, VEX_4V;
+                                        int_x86_avx2_pminu_d>, VEX_4V, VEX_L;
   defm VPMINUW   : SS41I_binop_rm_int_y<0x3A, "vpminuw",
-                                        int_x86_avx2_pminu_w>, VEX_4V;
+                                        int_x86_avx2_pminu_w>, VEX_4V, VEX_L;
   defm VPMAXSB   : SS41I_binop_rm_int_y<0x3C, "vpmaxsb",
-                                        int_x86_avx2_pmaxs_b>, VEX_4V;
+                                        int_x86_avx2_pmaxs_b>, VEX_4V, VEX_L;
   defm VPMAXSD   : SS41I_binop_rm_int_y<0x3D, "vpmaxsd",
-                                        int_x86_avx2_pmaxs_d>, VEX_4V;
+                                        int_x86_avx2_pmaxs_d>, VEX_4V, VEX_L;
   defm VPMAXUD   : SS41I_binop_rm_int_y<0x3F, "vpmaxud",
-                                        int_x86_avx2_pmaxu_d>, VEX_4V;
+                                        int_x86_avx2_pmaxu_d>, VEX_4V, VEX_L;
   defm VPMAXUW   : SS41I_binop_rm_int_y<0x3E, "vpmaxuw",
-                                        int_x86_avx2_pmaxu_w>, VEX_4V;
+                                        int_x86_avx2_pmaxu_w>, VEX_4V, VEX_L;
   defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
-                                        int_x86_avx2_pmul_dq>, VEX_4V;
+                                        int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -6437,9 +6666,9 @@
 }
 let Predicates = [HasAVX2] in {
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V;
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V;
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -6482,13 +6711,15 @@
     defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
                                         VR128, memopv4f32, f128mem, 0>, VEX_4V;
     defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
-              int_x86_avx_blend_ps_256, VR256, memopv8f32, f256mem, 0>, VEX_4V;
+                                    int_x86_avx_blend_ps_256, VR256, memopv8f32,
+                                    f256mem, 0>, VEX_4V, VEX_L;
     }
     let ExeDomain = SSEPackedDouble in {
     defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
                                         VR128, memopv2f64, f128mem, 0>, VEX_4V;
     defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
-              int_x86_avx_blend_pd_256, VR256, memopv4f64, f256mem, 0>, VEX_4V;
+                                     int_x86_avx_blend_pd_256, VR256,
+                                     memopv4f64, f256mem, 0>, VEX_4V, VEX_L;
     }
   defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
                                       VR128, memopv2i64, i128mem, 0>, VEX_4V;
@@ -6503,15 +6734,15 @@
                                    VR128, memopv2f64, f128mem, 0>, VEX_4V;
   let ExeDomain = SSEPackedSingle in
   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
-                                   VR256, memopv8f32, i256mem, 0>, VEX_4V;
+                                  VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
 }
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
   defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
-                                       VR256, memopv4i64, i256mem, 0>, VEX_4V;
+                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
-                                       VR256, memopv4i64, i256mem, 0>, VEX_4V;
+                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
   }
 }
 
@@ -6562,13 +6793,13 @@
 defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                            memopv2f64, int_x86_sse41_blendvpd>;
 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
-                                         memopv4f64, int_x86_avx_blendv_pd_256>;
+                                  memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
 defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                            memopv4f32, int_x86_sse41_blendvps>;
 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
-                                         memopv8f32, int_x86_avx_blendv_ps_256>;
+                                  memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
 } // ExeDomain = SSEPackedSingle
 defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                            memopv2i64, int_x86_sse41_pblendvb>;
@@ -6576,7 +6807,7 @@
 
 let Predicates = [HasAVX2] in {
 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                           memopv4i64, int_x86_avx2_pblendvb>;
+                                      memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
@@ -6679,7 +6910,7 @@
 def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
                 (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
 
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
   def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                             (v16i8 VR128:$src2))),
             (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
@@ -6717,7 +6948,7 @@
 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                          "vmovntdqa\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
-                         OpSize, VEX;
+                         OpSize, VEX, VEX_L;
 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movntdqa\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
@@ -6753,7 +6984,7 @@
 
 let Predicates = [HasAVX2] in
   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V;
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
 
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -6771,34 +7002,31 @@
                                                   imm:$src3))]>;
   def MEM : PseudoI<(outs VR128:$dst),
                     (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
-    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128
-                       VR128:$src1, (load addr:$src2), imm:$src3))]>;
+    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+                       (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
 }
 
 let Defs = [EFLAGS], usesCustomInserter = 1 in {
-  let AddedComplexity = 1 in
-    defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
-  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>;
+  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
+  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
 }
 
-let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in {
-  def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
-      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
-      "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
+multiclass pcmpistrm_SS42AI<string asm> {
+  def rr : SS42AI<0x62, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, OpSize;
   let mayLoad = 1 in
-  def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
-      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
-      "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
+  def rm : SS42AI<0x62, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, OpSize;
 }
 
 let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
-  def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
-      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
-      "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
-  let mayLoad = 1 in
-  def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
-      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
-      "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
+  let Predicates = [HasAVX] in
+  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm">;
 }
 
 // Packed Compare Explicit Length Strings, Return Mask
@@ -6809,74 +7037,103 @@
                        VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
   def MEM : PseudoI<(outs VR128:$dst),
                     (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
-    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
-                       VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>;
+    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+                       (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
 }
 
 let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
-  let AddedComplexity = 1 in
-    defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
-  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>;
+  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
+  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
 }
 
-let Predicates = [HasAVX],
-    Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
-  def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
-      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
-      "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
+multiclass SS42AI_pcmpestrm<string asm> {
+  def rr : SS42AI<0x60, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, OpSize;
   let mayLoad = 1 in
-  def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
-      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
-      "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
+  def rm : SS42AI<0x60, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, OpSize;
 }
 
 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
-  def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
-      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
-      "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
-  let mayLoad = 1 in
-  def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
-      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
-      "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
+  let Predicates = [HasAVX] in
+  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
 }
 
 // Packed Compare Implicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
-  multiclass SS42AI_pcmpistri<string asm> {
-    def rr : SS42AI<0x63, MRMSrcReg, (outs),
-      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
-      !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
-      []>, OpSize;
-    let mayLoad = 1 in
-    def rm : SS42AI<0x63, MRMSrcMem, (outs),
-      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
-      !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
-      []>, OpSize;
-  }
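+// Pseudo-instructions that carry the selection patterns; a custom inserter
+// expands them to the real pcmpistri encodings defined below.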
+multiclass pseudo_pcmpistri<string asm> {
+  def REG : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+  def MEM : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+                              (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
 }
 
-let Predicates = [HasAVX] in
-defm VPCMPISTRI  : SS42AI_pcmpistri<"vpcmpistri">, VEX;
-defm PCMPISTRI   : SS42AI_pcmpistri<"pcmpistri">;
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
+  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+  def rr : SS42AI<0x63, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, OpSize;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x63, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
+}
 
 // Packed Compare Explicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
-  multiclass SS42AI_pcmpestri<string asm> {
-    def rr : SS42AI<0x61, MRMSrcReg, (outs),
-      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
-      !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
-      []>, OpSize;
-    let mayLoad = 1 in
-    def rm : SS42AI<0x61, MRMSrcMem, (outs),
-      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
-      !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
-      []>, OpSize;
-  }
+multiclass pseudo_pcmpestri<string asm> {
+  def REG : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+  def MEM : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
+       imm:$src5))]>;
 }
 
-let Predicates = [HasAVX] in
-defm VPCMPESTRI  : SS42AI_pcmpestri<"vpcmpestri">, VEX;
-defm PCMPESTRI   : SS42AI_pcmpestri<"pcmpestri">;
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
+  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+  def rr : SS42AI<0x61, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, OpSize;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x61, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
+}
 
 //===----------------------------------------------------------------------===//
 // SSE4.2 - CRC Instructions
@@ -7167,27 +7424,27 @@
   def VBROADCASTSSrm  : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
                                       int_x86_avx_vbroadcast_ss>;
   def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
-                                      int_x86_avx_vbroadcast_ss_256>;
+                                      int_x86_avx_vbroadcast_ss_256>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in
 def VBROADCASTSDYrm  : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
-                                    int_x86_avx_vbroadcast_sd_256>;
+                                    int_x86_avx_vbroadcast_sd_256>, VEX_L;
 def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
-                                   int_x86_avx_vbroadcastf128_pd_256>;
+                                   int_x86_avx_vbroadcastf128_pd_256>, VEX_L;
 
 let ExeDomain = SSEPackedSingle in {
   def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
                                            int_x86_avx2_vbroadcast_ss_ps>;
   def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
-                                           int_x86_avx2_vbroadcast_ss_ps_256>;
+                                      int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in
 def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
-                                          int_x86_avx2_vbroadcast_sd_pd_256>;
+                                      int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L;
 
 let Predicates = [HasAVX2] in
 def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
-                                   int_x86_avx2_vbroadcasti128>;
+                                   int_x86_avx2_vbroadcasti128>, VEX_L;
 
 let Predicates = [HasAVX] in
 def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
@@ -7201,50 +7458,69 @@
 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR128:$src2, i8imm:$src3),
           "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, VEX_4V;
+          []>, VEX_4V, VEX_L;
 let mayLoad = 1 in
 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
           "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, VEX_4V;
+          []>, VEX_4V, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
 def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
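+// With AVX2, integer inserts select VINSERTI128 instead; on AVX1-only
+// targets they reuse VINSERTF128.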
+let Predicates = [HasAVX1Only] in {
 def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
-                                   (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
-                                   (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
-                                   (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 }
@@ -7256,64 +7532,69 @@
 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
           (ins VR256:$src1, i8imm:$src2),
           "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, VEX;
+          []>, VEX, VEX_L;
 let mayStore = 1 in
 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
           (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
           "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, VEX;
-}
-
-// Extract and store.
-let Predicates = [HasAVX] in {
-  def : Pat<(alignedstore (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), addr:$dst),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-  def : Pat<(alignedstore (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), addr:$dst),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-  def : Pat<(alignedstore (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), addr:$dst),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-
-  def : Pat<(int_x86_sse_storeu_ps addr:$dst, (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2)),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2)),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, (bc_v16i8 (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2))),
-          (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
+          []>, VEX, VEX_L;
 }
 
 // AVX1 patterns
 let Predicates = [HasAVX] in {
-def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4f32 (VEXTRACTF128rr
                     (v8f32 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2f64 (VEXTRACTF128rr
                     (v4f64 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+
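+// Fold extracts that feed aligned stores into the memory form.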
+def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
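+// Likewise, 128-bit integer extracts reuse VEXTRACTF128 on AVX1-only
+// targets; AVX2 selects VEXTRACTI128 (see below).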
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2i64 (VEXTRACTF128rr
-                    (v4i64 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+                  (v4i64 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4i32 (VEXTRACTF128rr
-                    (v8i32 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+                  (v8i32 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v8i16 (VEXTRACTF128rr
-                    (v16i16 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+                  (v16i16 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v16i8 (VEXTRACTF128rr
-                    (v32i8 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+                  (v32i8 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7331,7 +7612,7 @@
              (ins VR256:$src1, f256mem:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
-             VEX_4V;
+             VEX_4V, VEX_L;
   def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
              (ins f128mem:$dst, VR128:$src1, VR128:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -7339,7 +7620,7 @@
   def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
              (ins f256mem:$dst, VR256:$src1, VR256:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V;
+             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
 }
 
 let ExeDomain = SSEPackedSingle in
@@ -7387,13 +7668,13 @@
   defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                                memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
-                              memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>;
+                       memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                                memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
-                              memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>;
+                       memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
@@ -7421,38 +7702,38 @@
           (ins VR256:$src1, VR256:$src2, i8imm:$src3),
           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
-                              (i8 imm:$src3))))]>, VEX_4V;
+                              (i8 imm:$src3))))]>, VEX_4V, VEX_L;
 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
-                             (i8 imm:$src3)))]>, VEX_4V;
+                             (i8 imm:$src3)))]>, VEX_4V, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
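+// Integer VPERM2x128 patterns are restricted to AVX1-only targets; with
+// AVX2 they select VPERM2I128 instead (see below).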
+let Predicates = [HasAVX1Only] in {
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
-          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 
-def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1,
-                  (memopv8f32 addr:$src2), (i8 imm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
                   (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                   (memopv4i64 addr:$src2), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
-                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
                   (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
@@ -7503,9 +7784,9 @@
 
 let Predicates = [HasAVX, HasF16C] in {
   defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
-  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>;
+  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
   defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
-  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>;
+  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7537,7 +7818,7 @@
 defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
                                    VR128, memopv2i64, i128mem>;
 defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
-                                    VR256, memopv4i64, i256mem>;
+                                    VR256, memopv4i64, i256mem>, VEX_L;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7556,11 +7837,12 @@
                     (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                   [(set VR256:$dst, (Int256 VR128:$src))]>, VEX;
+                   [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L;
   def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
-                    (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
+                    (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
+                   VEX, VEX_L;
 }
 
 defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
@@ -7639,19 +7921,22 @@
 }
 
 // AVX1 broadcast patterns
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX1Only] in {
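+// Integer broadcasts from memory can reuse the FP broadcast instructions,
+// since only the loaded bit pattern matters.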
 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
           (VBROADCASTSSYrm addr:$src)>;
 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
           (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
 def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
           (VBROADCASTSSYrm addr:$src)>;
 def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
           (VBROADCASTSDYrm addr:$src)>;
 def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
           (VBROADCASTSSrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
-          (VBROADCASTSSrm addr:$src)>;
 
   // Provide a fallback in case the load node used in the patterns above
   // has additional users, which prevents pattern selection.
@@ -7692,7 +7977,8 @@
                    !strconcat(OpcodeStr,
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR256:$dst,
-                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, VEX_4V;
+                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+                   VEX_4V, VEX_L;
   def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins VR256:$src1, i256mem:$src2),
                    !strconcat(OpcodeStr,
@@ -7700,7 +7986,7 @@
                    [(set VR256:$dst,
                      (OpVT (X86VPermv VR256:$src1,
                             (bitconvert (mem_frag addr:$src2)))))]>,
-                   VEX_4V;
+                   VEX_4V, VEX_L;
 }
 
 defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
@@ -7714,14 +8000,15 @@
                      !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
-                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, VEX;
+                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+                     VEX, VEX_L;
   def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                      (ins i256mem:$src1, i8imm:$src2),
                      !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (OpVT (X86VPermi (mem_frag addr:$src1),
-                              (i8 imm:$src2))))]>, VEX;
+                              (i8 imm:$src2))))]>, VEX, VEX_L;
 }
 
 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
@@ -7731,20 +8018,18 @@
 //===----------------------------------------------------------------------===//
 // VPERM2I128 - Permute Integer Values in 128-bit chunks
 //
-let AddedComplexity = 1 in {
 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, i8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
-                            (i8 imm:$src3))))]>, VEX_4V;
+                            (i8 imm:$src3))))]>, VEX_4V, VEX_L;
 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
-                             (i8 imm:$src3)))]>, VEX_4V;
-}
+                             (i8 imm:$src3)))]>, VEX_4V, VEX_L;
 
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7771,31 +8056,51 @@
 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR128:$src2, i8imm:$src3),
           "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, VEX_4V;
+          []>, VEX_4V, VEX_L;
 let mayLoad = 1 in
 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
           "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, VEX_4V;
+          []>, VEX_4V, VEX_L;
 }
 
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
 def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7806,29 +8111,47 @@
           "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
           [(set VR128:$dst,
             (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
-          VEX;
+          VEX, VEX_L;
 let neverHasSideEffects = 1, mayStore = 1 in
 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
           (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
-          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX;
+          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+          VEX, VEX_L;
 
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+let Predicates = [HasAVX2] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2i64 (VEXTRACTI128rr
                     (v4i64 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4i32 (VEXTRACTI128rr
                     (v8i32 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v8i16 (VEXTRACTI128rr
                     (v16i16 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v16i8 (VEXTRACTI128rr
                     (v32i8 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
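+// Fold extracts that feed aligned stores into VEXTRACTI128mr.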
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7844,7 +8167,8 @@
   def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
              (ins VR256:$src1, i256mem:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, VEX_4V;
+             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+             VEX_4V, VEX_L;
   def mr  : AVX28I<0x8e, MRMDestMem, (outs),
              (ins i128mem:$dst, VR128:$src1, VR128:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -7852,7 +8176,7 @@
   def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
              (ins i256mem:$dst, VR256:$src1, VR256:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V;
+             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
 }
 
 defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
@@ -7890,14 +8214,14 @@
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
-             VEX_4V;
+             VEX_4V, VEX_L;
   def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
              (ins VR256:$src1, i256mem:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt256 (OpNode VR256:$src1,
                        (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
-             VEX_4V;
+             VEX_4V, VEX_L;
 }
 
 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;

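The new alignedstore patterns above allow instruction selection to emit
VEXTRACTI128 with a memory destination directly, rather than an extract
into a register followed by a separate store. A minimal sketch of source
code that should benefit, assuming an AVX2 target and the standard
immintrin.h intrinsics (the function name is illustrative):

  #include <immintrin.h>

  // Store the upper 128-bit lane of a 256-bit integer vector. With the
  // patterns above, the extract and the aligned store can be selected as
  // a single VEXTRACTI128mr (memory-destination) instruction.
  void store_upper_lane(__m256i v, __m128i *dst) {
    _mm_store_si128(dst, _mm256_extracti128_si256(v, 1));
  }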
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrShiftRotate.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrShiftRotate.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrShiftRotate.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrShiftRotate.td Tue Jan 15 11:16:16 2013
@@ -839,6 +839,16 @@
 
 } // Defs = [EFLAGS]
 
+def ROT32L2R_imm8  : SDNodeXForm<imm, [{
+  // Convert a ROTL shamt to a ROTR shamt on a 32-bit integer.
+  return getI8Imm(32 - N->getZExtValue());
+}]>;
+
+def ROT64L2R_imm8  : SDNodeXForm<imm, [{
+  // Convert a ROTL shamt to a ROTR shamt on a 64-bit integer.
+  return getI8Imm(64 - N->getZExtValue());
+}]>;
+
 multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
 let neverHasSideEffects = 1 in {
   def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2),
@@ -873,4 +883,72 @@
   defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
   defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, OpSize;
   defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, OpSize, VEX_W;
+
+  // Prefer RORX, which is non-destructive and doesn't update EFLAGS.
+  let AddedComplexity = 10 in {
+    def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+              (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+    def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+              (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+  }
+
+  def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+            (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+  def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+            (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+  // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shifts, but not
+  // with immediate shifts, i.e. the following code is considered better
+  //
+  //  mov %edi, %esi
+  //  shl $imm, %esi
+  //  ... %edi, ...
+  //
+  // than
+  //
+  //  movb $imm, %sil
+  //  shlx %sil, %edi, %esi
+  //  ... %edi, ...
+  //
+  let AddedComplexity = 1 in {
+    def : Pat<(sra GR32:$src1, GR8:$src2),
+              (SARX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(sra GR64:$src1, GR8:$src2),
+              (SARX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(srl GR32:$src1, GR8:$src2),
+              (SHRX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(srl GR64:$src1, GR8:$src2),
+              (SHRX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(shl GR32:$src1, GR8:$src2),
+              (SHLX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(shl GR64:$src1, GR8:$src2),
+              (SHLX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+  }
+
+  // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+  //
+  //  mov (%ecx), %esi
+  //  shl $imm, %esi
+  //
+  // over
+  //
+  //  movb $imm, %al
+  //  shlx %al, (%ecx), %esi
+  //
+  // As SARXrr/SHRXrr/SHLXrr are favored for variable shifts, the peephole
+  // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
 }

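The ROT32L2R_imm8/ROT64L2R_imm8 transforms above rest on the identity
rotl(x, n) == rotr(x, width - n), which is what lets an immediate left
rotate be matched onto RORX. A self-contained sketch of that identity in
plain C++ (independent of the TableGen machinery):

  #include <cassert>
  #include <cstdint>

  // Rotate left/right by n, assuming 0 < n < 32 to avoid UB on shifts.
  static uint32_t rotl32(uint32_t x, unsigned n) {
    return (x << n) | (x >> (32 - n));
  }
  static uint32_t rotr32(uint32_t x, unsigned n) {
    return (x >> n) | (x << (32 - n));
  }

  int main() {
    // A left rotate by n equals a right rotate by 32 - n, which is
    // exactly the immediate ROT32L2R_imm8 computes for RORX32ri.
    for (unsigned n = 1; n < 32; ++n)
      assert(rotl32(0xDEADBEEFu, n) == rotr32(0xDEADBEEFu, 32 - n));
    return 0;
  }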
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86InstrXOP.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86InstrXOP.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86InstrXOP.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86InstrXOP.td Tue Jan 15 11:16:16 2013
@@ -75,10 +75,10 @@
                      PatFrag memop> {
   def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR256:$dst, (Int VR256:$src))]>, VEX;
+           [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
   def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
 }
 
 let isAsmParserOnly = 1 in {
@@ -238,7 +238,7 @@
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
-           VEX_4V, VEX_I8IMM;
+           VEX_4V, VEX_I8IMM, VEX_L;
   def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, VR256:$src2, i256mem:$src3),
            !strconcat(OpcodeStr,
@@ -246,7 +246,7 @@
            [(set VR256:$dst,
              (Int VR256:$src1, VR256:$src2,
               (bitconvert (memopv4i64 addr:$src3))))]>,
-           VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
+           VEX_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
   def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, f256mem:$src2, VR256:$src3),
            !strconcat(OpcodeStr,
@@ -254,7 +254,7 @@
            [(set VR256:$dst,
              (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
               VR256:$src3))]>,
-           VEX_4V, VEX_I8IMM;
+           VEX_4V, VEX_I8IMM, VEX_L;
 }
 
 let isAsmParserOnly = 1 in {
@@ -287,20 +287,21 @@
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
         [(set VR256:$dst,
-          (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>;
+          (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
   def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
         (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4),
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
         [(set VR256:$dst,
           (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
-        VEX_W, MemOp4;
+        VEX_W, MemOp4, VEX_L;
   def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
         (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4),
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
         [(set VR256:$dst,
-           (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>;
+           (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
+        VEX_L;
 }
 
 defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,

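For context on the VEX_L flags being added throughout these files: the L
bit in the last VEX payload byte selects the 256-bit vector length (L=1)
as opposed to 128-bit (L=0), so every 256-bit "Y" form must carry it. A
rough sketch of that byte's layout in the three-byte VEX form (an
illustration of the encoding, not code from the tree):

  #include <cstdint>

  // Final payload byte of a 3-byte VEX prefix: W | vvvv | L | pp.
  // vvvv is the ones-complemented extra source register and pp encodes
  // the implied 66/F3/F2 prefix. Setting L selects YMM (256-bit)
  // operation -- the effect of the VEX_L flag on these definitions.
  static uint8_t vexByte2(bool W, uint8_t vvvv, bool L, uint8_t pp) {
    return (uint8_t(W) << 7) | ((~vvvv & 0xF) << 3) |
           (uint8_t(L) << 2) | (pp & 0x3);
  }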
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86JITInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86JITInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86JITInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86JITInfo.cpp Tue Jan 15 11:16:16 2013
@@ -532,6 +532,15 @@
 #endif
 }
 
+template<typename T> static void addUnaligned(void *Pos, T Delta) {
+  T Value;
+  std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
+              sizeof(T));
+  Value += Delta;
+  std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value),
+              sizeof(T));
+}
+
 /// relocate - Before the JIT can run a block of code that has been emitted,
 /// it must rewrite the code to contain the actual addresses of any
 /// referenced global symbols.
@@ -545,24 +554,24 @@
       // PC relative relocation, add the relocated value to the value already in
       // memory, after we adjust it for where the PC is.
       ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
-      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
       break;
     }
     case X86::reloc_picrel_word: {
       // PIC base relative relocation, add the relocated value to the value
       // already in memory, after we adjust it for where the PIC base is.
       ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
-      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
       break;
     }
     case X86::reloc_absolute_word:
     case X86::reloc_absolute_word_sext:
       // Absolute relocation, just add the relocated value to the value already
       // in memory.
-      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
       break;
     case X86::reloc_absolute_dword:
-      *((intptr_t*)RelocPos) += ResultPtr;
+      addUnaligned<intptr_t>(RelocPos, ResultPtr);
       break;
     }
   }

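The addUnaligned helper introduced above replaces direct dereferences of
possibly-unaligned relocation addresses, which are undefined behavior in
C++ and fault outright on stricter hosts, with a memcpy-based
read-modify-write. A small usage sketch against a plain byte buffer
standing in for JIT memory:

  #include <cstdint>
  #include <cstring>

  template <typename T> static void addUnaligned(void *Pos, T Delta) {
    T Value;
    std::memcpy(&Value, Pos, sizeof(T));  // safe at any alignment
    Value += Delta;
    std::memcpy(Pos, &Value, sizeof(T));
  }

  int main() {
    uint8_t Code[8] = {0};
    // Relocation word at an odd offset inside the emitted code.
    addUnaligned<uint32_t>(Code + 1, 0x1000);  // no alignment trap, no UB
    return 0;
  }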
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86MCInstLower.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86MCInstLower.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86MCInstLower.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86MCInstLower.cpp Tue Jan 15 11:16:16 2013
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "X86MCInstLower.h"
 #include "X86AsmPrinter.h"
 #include "X86COFFMachineModuleInfo.h"
 #include "InstPrinter/X86ATTInstPrinter.h"
@@ -29,6 +28,31 @@
 #include "llvm/ADT/SmallString.h"
 using namespace llvm;
 
+namespace {
+
+/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
+class X86MCInstLower {
+  MCContext &Ctx;
+  Mangler *Mang;
+  const MachineFunction &MF;
+  const TargetMachine &TM;
+  const MCAsmInfo &MAI;
+  X86AsmPrinter &AsmPrinter;
+public:
+  X86MCInstLower(Mangler *mang, const MachineFunction &MF,
+                 X86AsmPrinter &asmprinter);
+
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+  MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+} // end anonymous namespace
+
 X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf,
                                X86AsmPrinter &asmprinter)
 : Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()),
@@ -43,15 +67,11 @@
 /// operand to an MCSymbol.
 MCSymbol *X86MCInstLower::
 GetSymbolFromOperand(const MachineOperand &MO) const {
-  assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference");
+  assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
 
   SmallString<128> Name;
 
-  if (!MO.isGlobal()) {
-    assert(MO.isSymbol());
-    Name += MAI.getGlobalPrefix();
-    Name += MO.getSymbolName();
-  } else {
+  if (MO.isGlobal()) {
     const GlobalValue *GV = MO.getGlobal();
     bool isImplicitlyPrivate = false;
     if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
@@ -61,6 +81,11 @@
       isImplicitlyPrivate = true;
 
     Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+  } else if (MO.isSymbol()) {
+    Name += MAI.getGlobalPrefix();
+    Name += MO.getSymbolName();
+  } else if (MO.isMBB()) {
+    Name += MO.getMBB()->getSymbol()->getName();
   }
 
   // If the target flags on the operand changes the name of the symbol, do that
@@ -191,7 +216,7 @@
   if (Expr == 0)
     Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
 
-  if (!MO.isJTI() && MO.getOffset())
+  if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
     Expr = MCBinaryExpr::CreateAdd(Expr,
                                    MCConstantExpr::Create(MO.getOffset(), Ctx),
                                    Ctx);
@@ -324,9 +349,6 @@
       MCOp = MCOperand::CreateImm(MO.getImm());
       break;
     case MachineOperand::MO_MachineBasicBlock:
-      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
-                       MO.getMBB()->getSymbol(), Ctx));
-      break;
     case MachineOperand::MO_GlobalAddress:
     case MachineOperand::MO_ExternalSymbol:
       MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
@@ -371,18 +393,8 @@
   case X86::MOVZX64rm8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
   case X86::MOVZX64rr16:  LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
   case X86::MOVZX64rm16:  LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
-  case X86::SETB_C8r:     LowerUnaryToTwoAddr(OutMI, X86::SBB8rr); break;
-  case X86::SETB_C16r:    LowerUnaryToTwoAddr(OutMI, X86::SBB16rr); break;
-  case X86::SETB_C32r:    LowerUnaryToTwoAddr(OutMI, X86::SBB32rr); break;
-  case X86::SETB_C64r:    LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break;
   case X86::MOV8r0:       LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
   case X86::MOV32r0:      LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
-  case X86::V_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
-  case X86::AVX_SET0PSY:   LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
-  case X86::AVX_SET0PDY:   LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
-  case X86::AVX_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
-  case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break;
-  case X86::AVX2_SET0:     LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break;
 
   case X86::MOV16r0:
     LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0

Removed: llvm/branches/AMDILBackend/lib/Target/X86/X86MCInstLower.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86MCInstLower.h?rev=172540&view=auto
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86MCInstLower.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86MCInstLower.h (removed)
@@ -1,52 +0,0 @@
-//===-- X86MCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef X86_MCINSTLOWER_H
-#define X86_MCINSTLOWER_H
-
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-  class MCAsmInfo;
-  class MCContext;
-  class MCInst;
-  class MCOperand;
-  class MCSymbol;
-  class MachineInstr;
-  class MachineFunction;
-  class MachineModuleInfoMachO;
-  class MachineOperand;
-  class Mangler;
-  class TargetMachine;
-  class X86AsmPrinter;
-
-/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
-class LLVM_LIBRARY_VISIBILITY X86MCInstLower {
-  MCContext &Ctx;
-  Mangler *Mang;
-  const MachineFunction &MF;
-  const TargetMachine &TM;
-  const MCAsmInfo &MAI;
-  X86AsmPrinter &AsmPrinter;
-public:
-  X86MCInstLower(Mangler *mang, const MachineFunction &MF,
-                 X86AsmPrinter &asmprinter);
-
-  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
-
-  MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
-  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
-
-private:
-  MachineModuleInfoMachO &getMachOMMI() const;
-};
-
-}
-
-#endif

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.cpp Tue Jan 15 11:16:16 2013
@@ -106,23 +106,7 @@
 
 int
 X86RegisterInfo::getSEHRegNum(unsigned i) const {
-  int reg = X86_MC::getX86RegNum(i);
-  switch (i) {
-  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
-  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
-  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
-  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
-  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
-  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
-  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
-  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
-  case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
-  case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
-  case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
-  case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
-    reg += 8;
-  }
-  return reg;
+  return getEncodingValue(i);
 }
 
 const TargetRegisterClass *
@@ -245,15 +229,26 @@
 X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   bool callsEHReturn = false;
   bool ghcCall = false;
+  bool oclBiCall = false;
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
 
   if (MF) {
     callsEHReturn = MF->getMMI().callsEHReturn();
     const Function *F = MF->getFunction();
     ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
+    oclBiCall = (F ? F->getCallingConv() == CallingConv::Intel_OCL_BI : false);
   }
 
   if (ghcCall)
     return CSR_NoRegs_SaveList;
+  if (oclBiCall) {
+    if (HasAVX && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+    if (HasAVX && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX_SaveList;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_SaveList;
+  }
   if (Is64Bit) {
     if (IsWin64)
       return CSR_Win64_SaveList;
@@ -268,6 +263,16 @@
 
 const uint32_t*
 X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+
+  if (CC == CallingConv::Intel_OCL_BI) {
+    if (IsWin64 && HasAVX)
+      return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+    if (Is64Bit && HasAVX)
+      return CSR_64_Intel_OCL_BI_AVX_RegMask;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_RegMask;
+  }
   if (CC == CallingConv::GHC)
     return CSR_NoRegs_RegMask;
   if (!Is64Bit)
@@ -277,6 +282,11 @@
   return CSR_64_RegMask;
 }
 
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
 BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -398,8 +408,9 @@
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *F = MF.getFunction();
   unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
-  bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
-                               F->hasFnAttr(Attribute::StackAlignment));
+  bool requiresRealignment =
+    ((MFI->getMaxAlignment() > StackAlign) ||
+     F->getFnAttributes().hasAttribute(Attributes::StackAlignment));
 
   // If we've requested that we force align the stack do so now.
   if (ForceStackAlign)
@@ -522,7 +533,7 @@
 
 void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
-                                     int SPAdj, RegScavenger *RS) const{
+                                     int SPAdj, RegScavenger *RS) const {
   assert(SPAdj == 0 && "Unexpected");
 
   unsigned i = 0;
@@ -590,9 +601,10 @@
 }
 
 namespace llvm {
-unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
-  switch (VT.getSimpleVT().SimpleTy) {
-  default: return Reg;
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
+                                bool High) {
+  switch (VT) {
+  default: llvm_unreachable("Unexpected VT");
   case MVT::i8:
     if (High) {
       switch (Reg) {
@@ -608,7 +620,7 @@
       }
     } else {
       switch (Reg) {
-      default: return 0;
+      default: llvm_unreachable("Unexpected register");
       case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
         return X86::AL;
       case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -645,7 +657,7 @@
     }
   case MVT::i16:
     switch (Reg) {
-    default: return Reg;
+    default: llvm_unreachable("Unexpected register");
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::AX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -681,7 +693,7 @@
     }
   case MVT::i32:
     switch (Reg) {
-    default: return Reg;
+    default: llvm_unreachable("Unexpected register");
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::EAX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -733,7 +745,7 @@
       }
     }
     switch (Reg) {
-    default: return Reg;
+    default: llvm_unreachable("Unexpected register");
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::RAX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:

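getSEHRegNum now just returns the TableGen HWEncoding value: for the
REX-extended registers that encoding already carries the +8 the deleted
switch used to add by hand. A hedged sketch of the equivalence, with the
encoding constants made up here to mirror the Enc fields defined in
X86RegisterInfo.td below:

  #include <cassert>

  enum : unsigned { EncEAX = 0, EncR8 = 8, EncR15 = 15 };

  // New scheme: the hardware encoding is the SEH number as-is.
  static int getSEHRegNum(unsigned Enc) { return static_cast<int>(Enc); }

  int main() {
    assert(getSEHRegNum(EncEAX) == 0);
    assert(getSEHRegNum(EncR8)  == 8);   // old code: getX86RegNum(R8) + 8
    assert(getSEHRegNum(EncR15) == 15);
    return 0;
  }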
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.h Tue Jan 15 11:16:16 2013
@@ -58,10 +58,6 @@
 public:
   X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
 
-  /// getX86RegNum - Returns the native X86 register number for the given LLVM
-  /// register identifier.
-  static unsigned getX86RegNum(unsigned RegNo);
-
   // FIXME: This should be tablegen'd like getDwarfRegNum is
   int getSEHRegNum(unsigned i) const;
 
@@ -104,6 +100,7 @@
   /// callee-save registers on this target.
   const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
   const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+  const uint32_t *getNoPreservedMask() const;
 
   /// getReservedRegs - Returns a bitset indexed by physical register number
   /// indicating if a register is a special register that has particular uses and
@@ -141,8 +138,8 @@
 
 // getX86SubSuperRegister - X86 utility function. It returns the sub or super
 // register of a specific X86 register.
-// e.g. getX86SubSuperRegister(X86::EAX, EVT::i16) return X86:AX
-unsigned getX86SubSuperRegister(unsigned, EVT, bool High=false);
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX
+unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
 
 } // End llvm namespace
 

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.td?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.td (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86RegisterInfo.td Tue Jan 15 11:16:16 2013
@@ -13,258 +13,264 @@
 //
 //===----------------------------------------------------------------------===//
 
-//===----------------------------------------------------------------------===//
-//  Register definitions...
-//
-let Namespace = "X86" in {
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+  let Namespace = "X86";
+  let HWEncoding = Enc;
+  let SubRegs = subregs;
+}
 
-  // Subregister indices.
+// Subregister indices.
+let Namespace = "X86" in {
   def sub_8bit    : SubRegIndex;
   def sub_8bit_hi : SubRegIndex;
   def sub_16bit   : SubRegIndex;
   def sub_32bit   : SubRegIndex;
-  def sub_xmm : SubRegIndex;
+  def sub_xmm     : SubRegIndex;
+}
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others.  We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliases AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// the second for EH on X86-32/Darwin, and the third is the 'generic' one
+// (X86-32/Linux and debug information on X86-32/Darwin).
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL  : X86Reg<"sil",   6>;
+def DIL  : X86Reg<"dil",   7>;
+def BPL  : X86Reg<"bpl",   5>;
+def SPL  : X86Reg<"spl",   4>;
+def R8B  : X86Reg<"r8b",   8>;
+def R9B  : X86Reg<"r9b",   9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit] in {
+def SI : X86Reg<"si", 6, [SIL]>;
+def DI : X86Reg<"di", 7, [DIL]>;
+def BP : X86Reg<"bp", 5, [BPL]>;
+def SP : X86Reg<"sp", 4, [SPL]>;
+}
+def IP : X86Reg<"ip", 0>;
 
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+def R8W  : X86Reg<"r8w",   8, [R8B]>;
+def R9W  : X86Reg<"r9w",   9, [R9B]>;
+def R10W : X86Reg<"r10w", 10, [R10B]>;
+def R11W : X86Reg<"r11w", 11, [R11B]>;
+def R12W : X86Reg<"r12w", 12, [R12B]>;
+def R13W : X86Reg<"r13w", 13, [R13B]>;
+def R14W : X86Reg<"r14w", 14, [R14B]>;
+def R15W : X86Reg<"r15w", 15, [R15B]>;
+}
 
-  // In the register alias definitions below, we define which registers alias
-  // which others.  We only specify which registers the small registers alias,
-  // because the register file generator is smart enough to figure out that
-  // AL aliases AX if we tell it that AX aliased AL (for example).
-
-  // Dwarf numbering is different for 32-bit and 64-bit, and there are
-  // variations by target as well. Currently the first entry is for X86-64,
-  // second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux
-  // and debug information on X86-32/Darwin)
-
-  // 8-bit registers
-  // Low registers
-  def AL : Register<"al">;
-  def DL : Register<"dl">;
-  def CL : Register<"cl">;
-  def BL : Register<"bl">;
-
-  // X86-64 only, requires REX.
-  let CostPerUse = 1 in {
-  def SIL : Register<"sil">;
-  def DIL : Register<"dil">;
-  def BPL : Register<"bpl">;
-  def SPL : Register<"spl">;
-  def R8B  : Register<"r8b">;
-  def R9B  : Register<"r9b">;
-  def R10B : Register<"r10b">;
-  def R11B : Register<"r11b">;
-  def R12B : Register<"r12b">;
-  def R13B : Register<"r13b">;
-  def R14B : Register<"r14b">;
-  def R15B : Register<"r15b">;
-  }
-
-  // High registers. On x86-64, these cannot be used in any instruction
-  // with a REX prefix.
-  def AH : Register<"ah">;
-  def DH : Register<"dh">;
-  def CH : Register<"ch">;
-  def BH : Register<"bh">;
-
-  // 16-bit registers
-  let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
-  def AX : RegisterWithSubRegs<"ax", [AL,AH]>;
-  def DX : RegisterWithSubRegs<"dx", [DL,DH]>;
-  def CX : RegisterWithSubRegs<"cx", [CL,CH]>;
-  def BX : RegisterWithSubRegs<"bx", [BL,BH]>;
-  }
-  let SubRegIndices = [sub_8bit] in {
-  def SI : RegisterWithSubRegs<"si", [SIL]>;
-  def DI : RegisterWithSubRegs<"di", [DIL]>;
-  def BP : RegisterWithSubRegs<"bp", [BPL]>;
-  def SP : RegisterWithSubRegs<"sp", [SPL]>;
-  }
-  def IP : Register<"ip">;
-
-  // X86-64 only, requires REX.
-  let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
-  def R8W  : RegisterWithSubRegs<"r8w", [R8B]>;
-  def R9W  : RegisterWithSubRegs<"r9w", [R9B]>;
-  def R10W : RegisterWithSubRegs<"r10w", [R10B]>;
-  def R11W : RegisterWithSubRegs<"r11w", [R11B]>;
-  def R12W : RegisterWithSubRegs<"r12w", [R12B]>;
-  def R13W : RegisterWithSubRegs<"r13w", [R13B]>;
-  def R14W : RegisterWithSubRegs<"r14w", [R14B]>;
-  def R15W : RegisterWithSubRegs<"r15w", [R15B]>;
-  }
-  // 32-bit registers
-  let SubRegIndices = [sub_16bit] in {
-  def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>;
-  def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>;
-  def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>;
-  def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[-2, 3, 3]>;
-  def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>;
-  def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>;
-  def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>;
-  def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>;
-  def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>;
-
-  // X86-64 only, requires REX
-  let CostPerUse = 1 in {
-  def R8D  : RegisterWithSubRegs<"r8d", [R8W]>;
-  def R9D  : RegisterWithSubRegs<"r9d", [R9W]>;
-  def R10D : RegisterWithSubRegs<"r10d", [R10W]>;
-  def R11D : RegisterWithSubRegs<"r11d", [R11W]>;
-  def R12D : RegisterWithSubRegs<"r12d", [R12W]>;
-  def R13D : RegisterWithSubRegs<"r13d", [R13W]>;
-  def R14D : RegisterWithSubRegs<"r14d", [R14W]>;
-  def R15D : RegisterWithSubRegs<"r15d", [R15W]>;
-  }}
-
-  // 64-bit registers, X86-64 only
-  let SubRegIndices = [sub_32bit] in {
-  def RAX : RegisterWithSubRegs<"rax", [EAX]>, DwarfRegNum<[0, -2, -2]>;
-  def RDX : RegisterWithSubRegs<"rdx", [EDX]>, DwarfRegNum<[1, -2, -2]>;
-  def RCX : RegisterWithSubRegs<"rcx", [ECX]>, DwarfRegNum<[2, -2, -2]>;
-  def RBX : RegisterWithSubRegs<"rbx", [EBX]>, DwarfRegNum<[3, -2, -2]>;
-  def RSI : RegisterWithSubRegs<"rsi", [ESI]>, DwarfRegNum<[4, -2, -2]>;
-  def RDI : RegisterWithSubRegs<"rdi", [EDI]>, DwarfRegNum<[5, -2, -2]>;
-  def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>;
-  def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>;
-
-  // These also require REX.
-  let CostPerUse = 1 in {
-  def R8  : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>;
-  def R9  : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>;
-  def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>;
-  def R11 : RegisterWithSubRegs<"r11", [R11D]>, DwarfRegNum<[11, -2, -2]>;
-  def R12 : RegisterWithSubRegs<"r12", [R12D]>, DwarfRegNum<[12, -2, -2]>;
-  def R13 : RegisterWithSubRegs<"r13", [R13D]>, DwarfRegNum<[13, -2, -2]>;
-  def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>;
-  def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>;
-  def RIP : RegisterWithSubRegs<"rip", [EIP]>,  DwarfRegNum<[16, -2, -2]>;
-  }}
-
-  // MMX Registers. These are actually aliased to ST0 .. ST7
-  def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>;
-  def MM1 : Register<"mm1">, DwarfRegNum<[42, 30, 30]>;
-  def MM2 : Register<"mm2">, DwarfRegNum<[43, 31, 31]>;
-  def MM3 : Register<"mm3">, DwarfRegNum<[44, 32, 32]>;
-  def MM4 : Register<"mm4">, DwarfRegNum<[45, 33, 33]>;
-  def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>;
-  def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>;
-  def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>;
-
-  // Pseudo Floating Point registers
-  def FP0 : Register<"fp0">;
-  def FP1 : Register<"fp1">;
-  def FP2 : Register<"fp2">;
-  def FP3 : Register<"fp3">;
-  def FP4 : Register<"fp4">;
-  def FP5 : Register<"fp5">;
-  def FP6 : Register<"fp6">;
-
-  // XMM Registers, used by the various SSE instruction set extensions.
-  def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>;
-  def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>;
-  def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>;
-  def XMM3: Register<"xmm3">, DwarfRegNum<[20, 24, 24]>;
-  def XMM4: Register<"xmm4">, DwarfRegNum<[21, 25, 25]>;
-  def XMM5: Register<"xmm5">, DwarfRegNum<[22, 26, 26]>;
-  def XMM6: Register<"xmm6">, DwarfRegNum<[23, 27, 27]>;
-  def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>;
-
-  // X86-64 only
-  let CostPerUse = 1 in {
-  def XMM8:  Register<"xmm8">,  DwarfRegNum<[25, -2, -2]>;
-  def XMM9:  Register<"xmm9">,  DwarfRegNum<[26, -2, -2]>;
-  def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>;
-  def XMM11: Register<"xmm11">, DwarfRegNum<[28, -2, -2]>;
-  def XMM12: Register<"xmm12">, DwarfRegNum<[29, -2, -2]>;
-  def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
-  def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
-  def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
-  } // CostPerUse
-
-  // YMM Registers, used by AVX instructions
-  let SubRegIndices = [sub_xmm] in {
-  def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>;
-  def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>;
-  def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>;
-  def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>;
-  def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>;
-  def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>;
-  def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>;
-  def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>;
-  def YMM8:  RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>;
-  def YMM9:  RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>;
-  def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>;
-  def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>;
-  def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>;
-  def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>;
-  def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>;
-  def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>;
-  }
-
-  class STRegister<string Name, list<Register> A> : Register<Name> {
-    let Aliases = A;
-  }
-
-  // Floating point stack registers. These don't map one-to-one to the FP
-  // pseudo registers, but we still mark them as aliasing FP registers. That
-  // way both kinds can be live without exceeding the stack depth. ST registers
-  // are only live around inline assembly.
-  def ST0 : STRegister<"st(0)", []>, DwarfRegNum<[33, 12, 11]>;
-  def ST1 : STRegister<"st(1)", [FP6]>, DwarfRegNum<[34, 13, 12]>;
-  def ST2 : STRegister<"st(2)", [FP5]>, DwarfRegNum<[35, 14, 13]>;
-  def ST3 : STRegister<"st(3)", [FP4]>, DwarfRegNum<[36, 15, 14]>;
-  def ST4 : STRegister<"st(4)", [FP3]>, DwarfRegNum<[37, 16, 15]>;
-  def ST5 : STRegister<"st(5)", [FP2]>, DwarfRegNum<[38, 17, 16]>;
-  def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>;
-  def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>;
-
-  // Floating-point status word
-  def FPSW : Register<"fpsw">;
-
-  // Status flags register
-  def EFLAGS : Register<"flags">;
-
-  // Segment registers
-  def CS : Register<"cs">;
-  def DS : Register<"ds">;
-  def SS : Register<"ss">;
-  def ES : Register<"es">;
-  def FS : Register<"fs">;
-  def GS : Register<"gs">;
-
-  // Debug registers
-  def DR0 : Register<"dr0">;
-  def DR1 : Register<"dr1">;
-  def DR2 : Register<"dr2">;
-  def DR3 : Register<"dr3">;
-  def DR4 : Register<"dr4">;
-  def DR5 : Register<"dr5">;
-  def DR6 : Register<"dr6">;
-  def DR7 : Register<"dr7">;
-
-  // Control registers
-  def CR0 : Register<"cr0">;
-  def CR1 : Register<"cr1">;
-  def CR2 : Register<"cr2">;
-  def CR3 : Register<"cr3">;
-  def CR4 : Register<"cr4">;
-  def CR5 : Register<"cr5">;
-  def CR6 : Register<"cr6">;
-  def CR7 : Register<"cr7">;
-  def CR8 : Register<"cr8">;
-  def CR9 : Register<"cr9">;
-  def CR10 : Register<"cr10">;
-  def CR11 : Register<"cr11">;
-  def CR12 : Register<"cr12">;
-  def CR13 : Register<"cr13">;
-  def CR14 : Register<"cr14">;
-  def CR15 : Register<"cr15">;
-
-  // Pseudo index registers
-  def EIZ : Register<"eiz">;
-  def RIZ : Register<"riz">;
+// 32-bit registers
+let SubRegIndices = [sub_16bit] in {
+def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+// X86-64 only, requires REX
+let CostPerUse = 1 in {
+def R8D  : X86Reg<"r8d",   8, [R8W]>;
+def R9D  : X86Reg<"r9d",   9, [R9W]>;
+def R10D : X86Reg<"r10d", 10, [R10W]>;
+def R11D : X86Reg<"r11d", 11, [R11W]>;
+def R12D : X86Reg<"r12d", 12, [R12W]>;
+def R13D : X86Reg<"r13d", 13, [R13W]>;
+def R14D : X86Reg<"r14d", 14, [R14W]>;
+def R15D : X86Reg<"r15d", 15, [R15W]>;
+}}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8  : X86Reg<"r8",   8, [R8D]>,  DwarfRegNum<[ 8, -2, -2]>;
+def R9  : X86Reg<"r9",   9, [R9D]>,  DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip",  0, [EIP]>,  DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8:  X86Reg<"xmm8",   8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9:  X86Reg<"xmm9",   9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+} // CostPerUse
+
+// YMM Registers, used by AVX instructions
+let SubRegIndices = [sub_xmm] in {
+def YMM0:  X86Reg<"ymm0",   0, [XMM0]>,  DwarfRegAlias<XMM0>;
+def YMM1:  X86Reg<"ymm1",   1, [XMM1]>,  DwarfRegAlias<XMM1>;
+def YMM2:  X86Reg<"ymm2",   2, [XMM2]>,  DwarfRegAlias<XMM2>;
+def YMM3:  X86Reg<"ymm3",   3, [XMM3]>,  DwarfRegAlias<XMM3>;
+def YMM4:  X86Reg<"ymm4",   4, [XMM4]>,  DwarfRegAlias<XMM4>;
+def YMM5:  X86Reg<"ymm5",   5, [XMM5]>,  DwarfRegAlias<XMM5>;
+def YMM6:  X86Reg<"ymm6",   6, [XMM6]>,  DwarfRegAlias<XMM6>;
+def YMM7:  X86Reg<"ymm7",   7, [XMM7]>,  DwarfRegAlias<XMM7>;
+def YMM8:  X86Reg<"ymm8",   8, [XMM8]>,  DwarfRegAlias<XMM8>;
+def YMM9:  X86Reg<"ymm9",   9, [XMM9]>,  DwarfRegAlias<XMM9>;
+def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>;
+def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>;
+def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>;
+def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>;
+def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>;
+def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>;
 }
 
+class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
+  let Aliases = A;
+}
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : STRegister<"st(0)", 0, []>,    DwarfRegNum<[33, 12, 11]>;
+def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsw", 0>;
+
+// Status flags register
+def EFLAGS : X86Reg<"flags", 0>;
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0 : X86Reg<"dr0", 0>;
+def DR1 : X86Reg<"dr1", 1>;
+def DR2 : X86Reg<"dr2", 2>;
+def DR3 : X86Reg<"dr3", 3>;
+def DR4 : X86Reg<"dr4", 4>;
+def DR5 : X86Reg<"dr5", 5>;
+def DR6 : X86Reg<"dr6", 6>;
+def DR7 : X86Reg<"dr7", 7>;
+
+// Control registers
+def CR0  : X86Reg<"cr0",   0>;
+def CR1  : X86Reg<"cr1",   1>;
+def CR2  : X86Reg<"cr2",   2>;
+def CR3  : X86Reg<"cr3",   3>;
+def CR4  : X86Reg<"cr4",   4>;
+def CR5  : X86Reg<"cr5",   5>;
+def CR6  : X86Reg<"cr6",   6>;
+def CR7  : X86Reg<"cr7",   7>;
+def CR8  : X86Reg<"cr8",   8>;
+def CR9  : X86Reg<"cr9",   9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
 
 //===----------------------------------------------------------------------===//
 // Register Class Definitions... now that we have all of the pieces, define the

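The DwarfRegNum triples above are indexed by debug-info flavor, per the
comment: entry 0 for X86-64, entry 1 for EH on X86-32/Darwin, entry 2
for the generic X86-32 numbering. A sketch of how a consumer would pick
an entry (the enum and helper are assumptions for illustration, not an
API in the tree):

  #include <array>

  enum DwarfFlavor { X86_64 = 0, EH_Darwin32 = 1, Generic32 = 2 };

  // -2 marks "no number for this flavor", as in the .td entries; e.g.
  // EAX is DwarfRegNum<[-2, 0, 0]>: invalid on X86-64, register 0 for
  // both 32-bit flavors.
  static int dwarfNum(const std::array<int, 3> &Triple, DwarfFlavor F) {
    return Triple[F];
  }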
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86SelectionDAGInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86SelectionDAGInfo.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86SelectionDAGInfo.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86SelectionDAGInfo.cpp Tue Jan 15 11:16:16 2013
@@ -54,7 +54,7 @@
     if (const char *bzeroEntry =  V &&
         V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
       EVT IntPtr = TLI.getPointerTy();
-      Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+      Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
       TargetLowering::ArgListTy Args;
       TargetLowering::ArgListEntry Entry;
       Entry.Node = Dst;

Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86Subtarget.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86Subtarget.cpp Tue Jan 15 11:16:16 2013
@@ -163,17 +163,6 @@
   return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
 }
 
-/// getSpecialAddressLatency - For targets where it is beneficial to
-/// backschedule instructions that compute addresses, return a value
-/// indicating the number of scheduling cycles of backscheduling that
-/// should be attempted.
-unsigned X86Subtarget::getSpecialAddressLatency() const {
-  // For x86 out-of-order targets, back-schedule address computations so
-  // that loads and stores aren't blocked.
-  // This value was chosen arbitrarily.
-  return 200;
-}
-
 void X86Subtarget::AutoDetectSubtargetFeatures() {
   unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
   unsigned MaxLevel;
@@ -246,8 +235,11 @@
     }
 
     // If it's Nehalem, unaligned memory access is fast.
-    // FIXME: Nehalem is family 6. Also include Westmere and later processors?
-    if (Family == 15 && Model == 26) {
+    // Include Westmere and Sandy Bridge as well.
+    // FIXME: add later processors.
+    if (IsIntel && ((Family == 6 && Model == 26) ||
+        (Family == 6 && Model == 44) ||
+        (Family == 6 && Model == 42))) {
       IsUAMemFast = true;
       ToggleFeature(X86::FeatureFastUAMem);
     }
@@ -310,6 +302,10 @@
         HasBMI2 = true;
         ToggleFeature(X86::FeatureBMI2);
       }
+      if (IsIntel && ((EBX >> 11) & 0x1)) {
+        HasRTM = true;
+        ToggleFeature(X86::FeatureRTM);
+      }
     }
   }
 }
@@ -338,11 +334,13 @@
   , HasLZCNT(false)
   , HasBMI(false)
   , HasBMI2(false)
+  , HasRTM(false)
   , IsBTMemSlow(false)
   , IsUAMemFast(false)
   , HasVectorUAMem(false)
   , HasCmpxchg16b(false)
   , UseLeaForSP(false)
+  , HasSlowDivide(false)
   , PostRAScheduler(false)
   , stackAlignment(4)
   // FIXME: this is a known good value for Yonah. How about others?
@@ -397,6 +395,10 @@
     }
   }
 
+  // CPUName may have been set by the CPU detection code. Make sure the
+  // new MCSchedModel is used.
+  InitMCProcessorInfo(CPUName, FS);
+
   if (X86ProcFamily == IntelAtom)
     PostRAScheduler = true;
 
@@ -413,12 +415,12 @@
   assert((!In64BitMode || HasX86_64) &&
          "64-bit code requested on a subtarget that doesn't support it!");
 
-  // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both
+  // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both
   // 32 and 64 bit) and for all 64-bit targets.
   if (StackAlignOverride)
     stackAlignment = StackAlignOverride;
-  else if (isTargetDarwin() || isTargetFreeBSD() || isTargetLinux() ||
-           isTargetSolaris() || In64BitMode)
+  else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
+           In64BitMode)
     stackAlignment = 16;
 }
 

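The unaligned-memory heuristic above now keys on Intel family 6, models
26/44/42 (Nehalem, Westmere, Sandy Bridge); the old check against family
15 could never match Nehalem. For reference, a sketch of decoding family
and model from CPUID leaf 1 EAX using the standard extended-family and
extended-model rules:

  #include <cstdint>

  struct CpuSig { unsigned Family, Model; };

  static CpuSig decodeSignature(uint32_t EAX) {
    unsigned Family = (EAX >> 8) & 0xF;
    unsigned Model  = (EAX >> 4) & 0xF;
    if (Family == 15)                    // extended family applies
      Family += (EAX >> 20) & 0xFF;
    if (Family == 6 || Family >= 15)     // extended model applies
      Model |= ((EAX >> 16) & 0xF) << 4;
    return {Family, Model};
  }

  // A Nehalem part decodes to Family 6, Model 26, matching the fixed
  // check in AutoDetectSubtargetFeatures above.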
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86Subtarget.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86Subtarget.h Tue Jan 15 11:16:16 2013
@@ -118,6 +118,9 @@
   /// HasBMI2 - Processor has BMI2 instructions.
   bool HasBMI2;
 
+  /// HasRTM - Processor has RTM instructions.
+  bool HasRTM;
+
   /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
   bool IsBTMemSlow;
 
@@ -136,6 +139,10 @@
   /// the stack pointer. This is an optimization for Intel Atom processors.
   bool UseLeaForSP;
 
+  /// HasSlowDivide - True if smaller divides are significantly faster than
+  /// full divides and should be used when possible.
+  bool HasSlowDivide;
+
   /// PostRAScheduler - True if using post-register-allocation scheduler.
   bool PostRAScheduler;
 
@@ -205,7 +212,8 @@
   bool hasAES() const { return HasAES; }
   bool hasPCLMUL() const { return HasPCLMUL; }
   bool hasFMA() const { return HasFMA; }
-  bool hasFMA4() const { return HasFMA4; }
+  // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
+  bool hasFMA4() const { return HasFMA4 && !HasFMA; }
   bool hasXOP() const { return HasXOP; }
   bool hasMOVBE() const { return HasMOVBE; }
   bool hasRDRAND() const { return HasRDRAND; }
@@ -214,11 +222,13 @@
   bool hasLZCNT() const { return HasLZCNT; }
   bool hasBMI() const { return HasBMI; }
   bool hasBMI2() const { return HasBMI2; }
+  bool hasRTM() const { return HasRTM; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
   bool hasVectorUAMem() const { return HasVectorUAMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasSlowDivide() const { return HasSlowDivide; }
 
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
 
@@ -231,10 +241,10 @@
   bool isTargetSolaris() const {
     return TargetTriple.getOS() == Triple::Solaris;
   }
-
-  // ELF is a reasonably sane default and the only other X86 targets we
-  // support are Darwin and Windows. Just use "not those".
-  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetELF() const {
+    return (TargetTriple.getEnvironment() == Triple::ELF ||
+            TargetTriple.isOSBinFormatELF());
+  }
   bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
   bool isTargetNaCl() const {
     return TargetTriple.getOS() == Triple::NativeClient;
@@ -245,7 +255,10 @@
   bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; }
   bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; }
   bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
-  bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+  bool isTargetCOFF() const {
+    return (TargetTriple.getEnvironment() != Triple::ELF &&
+            TargetTriple.isOSBinFormatCOFF());
+  }
   bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
 
   bool isTargetWin64() const {
@@ -296,12 +309,6 @@
   /// returns null.
   const char *getBZeroEntry() const;
 
-  /// getSpecialAddressLatency - For targets where it is beneficial to
-  /// backschedule instructions that compute addresses, return a value
-  /// indicating the number of scheduling cycles of backscheduling that
-  /// should be attempted.
-  unsigned getSpecialAddressLatency() const;
-
   /// enablePostRAScheduler - run for Atom optimization.
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
                              TargetSubtargetInfo::AntiDepBreakMode& Mode,

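The hasFMA4 change makes FMA3 and FMA4 mutually exclusive from the
pattern selector's point of view, so overlapping FMA patterns cannot
both apply on a CPU reporting both features. A short sketch of the
accessor-level gating (field names chosen to mirror the subtarget):

  struct Features {
    bool HasFMA = false, HasFMA4 = false;
    bool hasFMA()  const { return HasFMA; }
    // Prefer FMA3 when both are present; see the FIXME above.
    bool hasFMA4() const { return HasFMA4 && !HasFMA; }
  };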
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86TargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86TargetMachine.cpp?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86TargetMachine.cpp (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86TargetMachine.cpp Tue Jan 15 11:16:16 2013
@@ -36,7 +36,7 @@
                                          Reloc::Model RM, CodeModel::Model CM,
                                          CodeGenOpt::Level OL)
   : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false),
-    DataLayout(getSubtargetImpl()->isTargetDarwin() ?
+    DL(getSubtargetImpl()->isTargetDarwin() ?
                "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
                "n8:16:32-S128" :
                (getSubtargetImpl()->isTargetCygMing() ||
@@ -48,7 +48,8 @@
     InstrInfo(*this),
     TSInfo(*this),
     TLInfo(*this),
-    JITInfo(*this) {
+    JITInfo(*this),
+    STTI(&TLInfo), VTTI(&TLInfo) {
 }
 
 void X86_64TargetMachine::anchor() { }
@@ -59,12 +60,13 @@
                                          Reloc::Model RM, CodeModel::Model CM,
                                          CodeGenOpt::Level OL)
   : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true),
-    DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+    DL("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
                "n8:16:32:64-S128"),
     InstrInfo(*this),
     TSInfo(*this),
     TLInfo(*this),
-    JITInfo(*this) {
+    JITInfo(*this),
+    STTI(&TLInfo), VTTI(&TLInfo){
 }
 
 /// X86TargetMachine ctor - Create an X86 target.
@@ -78,7 +80,6 @@
   : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
     Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit),
     FrameLowering(*this, Subtarget),
-    ELFWriterInfo(is64Bit, true),
     InstrItins(Subtarget.getInstrItineraryData()){
   // Determine the PICStyle based on the target selected.
   if (getRelocationModel() == Reloc::Static) {
@@ -113,6 +114,12 @@
   cl::desc("Minimize AVX to SSE transition penalty"),
   cl::init(true));
 
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt",
+               cl::desc("Enable early if-conversion on X86"));
+
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
@@ -142,7 +149,7 @@
 TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
   X86PassConfig *PC = new X86PassConfig(this, PM);
 
-  if (Subtarget.hasCMov())
+  if (X86EarlyIfConv && Subtarget.hasCMov())
     PC->enablePass(&EarlyIfConverterID);
 
   return PC;

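Note that the new X86EarlyIfConv option above has no cl::init, so it defaults
to false: early if-conversion on x86 becomes opt-in rather than enabled for
every CMOV-capable subtarget. Assuming the usual llc plumbing for backend
cl::opt flags, it would be requested like this (illustrative command line):

  llc -mtriple=x86_64-unknown-linux -x86-early-ifcvt test.ll -o test.s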
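The DataLayout/DL edits in this file track the rename of TargetData to
DataLayout (now declared in llvm/DataLayout.h, see the header diff below);
the member itself is renamed to DL so it no longer shadows the class name.
Out-of-tree clients need the matching accessor rename, roughly (hypothetical
client code, not part of this patch):

  // Before this change:
  //   const TargetData *TD = TM.getTargetData();
  const DataLayout *DL = TM.getDataLayout(); // same layout string, new name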
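The STTI/VTTI members initialized in the constructors above (and exposed via
getScalarTargetTransformInfo()/getVectorTargetTransformInfo() in the header
diff below) feed the TargetTransformInfo immutable pass of this era. A rough
sketch of how a transform pass would consume them (assumes the 3.2-era
llvm/TargetTransformInfo.h API; names as in that header):

  // Inside some FunctionPass::runOnFunction (sketch):
  if (const TargetTransformInfo *TTI =
        getAnalysisIfAvailable<TargetTransformInfo>()) {
    const ScalarTargetTransformInfo *STTI = TTI->getScalarTargetTransformInfo();
    const VectorTargetTransformInfo *VTTI = TTI->getVectorTargetTransformInfo();
    (void)STTI; (void)VTTI; // cost and legality queries go here
  }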
Modified: llvm/branches/AMDILBackend/lib/Target/X86/X86TargetMachine.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/AMDILBackend/lib/Target/X86/X86TargetMachine.h?rev=172541&r1=172540&r2=172541&view=diff
==============================================================================
--- llvm/branches/AMDILBackend/lib/Target/X86/X86TargetMachine.h (original)
+++ llvm/branches/AMDILBackend/lib/Target/X86/X86TargetMachine.h Tue Jan 15 11:16:16 2013
@@ -15,7 +15,6 @@
 #define X86TARGETMACHINE_H
 
 #include "X86.h"
-#include "X86ELFWriterInfo.h"
 #include "X86InstrInfo.h"
 #include "X86ISelLowering.h"
 #include "X86FrameLowering.h"
@@ -23,8 +22,9 @@
 #include "X86SelectionDAGInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetTransformImpl.h"
 
 namespace llvm {
 
@@ -33,7 +33,6 @@
 class X86TargetMachine : public LLVMTargetMachine {
   X86Subtarget       Subtarget;
   X86FrameLowering   FrameLowering;
-  X86ELFWriterInfo   ELFWriterInfo;
   InstrItineraryData InstrItins;
 
 public:
@@ -62,9 +61,6 @@
   virtual const X86RegisterInfo  *getRegisterInfo() const {
     return &getInstrInfo()->getRegisterInfo();
   }
-  virtual const X86ELFWriterInfo *getELFWriterInfo() const {
-    return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
-  }
   virtual const InstrItineraryData *getInstrItineraryData() const {
     return &InstrItins;
   }
@@ -80,17 +76,19 @@
 ///
 class X86_32TargetMachine : public X86TargetMachine {
   virtual void anchor();
-  const TargetData  DataLayout; // Calculates type size & alignment
+  const DataLayout  DL; // Calculates type size & alignment
   X86InstrInfo      InstrInfo;
   X86SelectionDAGInfo TSInfo;
   X86TargetLowering TLInfo;
   X86JITInfo        JITInfo;
+  ScalarTargetTransformImpl STTI;
+  X86VectorTargetTransformInfo VTTI;
 public:
   X86_32TargetMachine(const Target &T, StringRef TT,
                       StringRef CPU, StringRef FS, const TargetOptions &Options,
                       Reloc::Model RM, CodeModel::Model CM,
                       CodeGenOpt::Level OL);
-  virtual const TargetData *getTargetData() const { return &DataLayout; }
+  virtual const DataLayout *getDataLayout() const { return &DL; }
   virtual const X86TargetLowering *getTargetLowering() const {
     return &TLInfo;
   }
@@ -103,23 +101,31 @@
   virtual       X86JITInfo       *getJITInfo()         {
     return &JITInfo;
   }
+  virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+    return &STTI;
+  }
+  virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+    return &VTTI;
+  }
 };
 
 /// X86_64TargetMachine - X86 64-bit target machine.
 ///
 class X86_64TargetMachine : public X86TargetMachine {
   virtual void anchor();
-  const TargetData  DataLayout; // Calculates type size & alignment
+  const DataLayout  DL; // Calculates type size & alignment
   X86InstrInfo      InstrInfo;
   X86SelectionDAGInfo TSInfo;
   X86TargetLowering TLInfo;
   X86JITInfo        JITInfo;
+  ScalarTargetTransformImpl STTI;
+  X86VectorTargetTransformInfo VTTI;
 public:
   X86_64TargetMachine(con