[llvm-commits] [llvm] r100090 - in /llvm/trunk: include/llvm/Target/ lib/CodeGen/SelectionDAG/ lib/Target/PowerPC/ lib/Target/X86/ test/CodeGen/X86/

Dan Gohman gohman at apple.com
Thu Apr 1 12:49:08 PDT 2010


Hi Evan,

A few comments below.

On Mar 31, 2010, at 11:04 PM, Evan Cheng wrote:

> Author: evancheng
> Date: Thu Apr  1 01:04:33 2010
> New Revision: 100090
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=100090&view=rev
> Log:
> Fix sdisel memcpy, memset, memmove lowering:
> 1. Make it possible to lower with floating-point loads and stores.
> 2. Avoid unaligned loads / stores unless they are fast on the target.
> 3. Fix a memcpy lowering logic bug related to when to optimize a
>   load from a constant string into a constant.
> 4. Adjust the x86 memcpy lowering threshold to make it more sane.
> 5. Fix the x86 target hook so it uses vector and floating-point memory
>   ops more effectively.
> rdar://7774704
> 
> Modified:
>    llvm/trunk/include/llvm/Target/TargetLowering.h
>    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
>    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
>    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
>    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
>    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>    llvm/trunk/lib/Target/X86/X86ISelLowering.h
>    llvm/trunk/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
>    llvm/trunk/test/CodeGen/X86/byval7.ll
>    llvm/trunk/test/CodeGen/X86/memcpy-2.ll
>    llvm/trunk/test/CodeGen/X86/memset-2.ll
>    llvm/trunk/test/CodeGen/X86/memset64-on-x86-32.ll
>    llvm/trunk/test/CodeGen/X86/small-byval-memcpy.ll
>    llvm/trunk/test/CodeGen/X86/unaligned-load.ll
> 

> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=100090&r1=100089&r2=100090&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Apr  1 01:04:33 2010

> @@ -5250,17 +5250,6 @@
>   SDValue Value = ST->getValue();
>   SDValue Ptr   = ST->getBasePtr();
> 
> -  // Try to infer better alignment information than the store already has.
> -  if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
> -    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
> -      if (Align > ST->getAlignment())
> -        return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
> -                                 Ptr, ST->getSrcValue(),
> -                                 ST->getSrcValueOffset(), ST->getMemoryVT(),
> -                                 ST->isVolatile(), ST->isNonTemporal(), Align);
> -    }
> -  }
> -
>   // If this is a store of a bit convert, store the input value if the
>   // resultant store does not need a higher alignment than the original.
>   if (Value.getOpcode() == ISD::BIT_CONVERT && !ST->isTruncatingStore() &&
> @@ -5351,6 +5340,17 @@
>     }
>   }
> 
> +  // Try to infer better alignment information than the store already has.
> +  if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
> +    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
> +      if (Align > ST->getAlignment())
> +        return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
> +                                 Ptr, ST->getSrcValue(),
> +                                 ST->getSrcValueOffset(), ST->getMemoryVT(),
> +                                 ST->isVolatile(), ST->isNonTemporal(), Align);
> +    }
> +  }
> +

Why did this code block move? It seems that the code it now follows
would also benefit from having more precise alignment information.

On the other hand, instcombine's alignment refinement code has been
substantially improved since this code in dagcombine was originally
written, so dagcombine may no longer need it at all.
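
For example, the bit_convert-of-store combine near the top of this
function (visible at the start of the hunk above) fires only when the
converted store "does not need a higher alignment than the original",
so it reads ST->getAlignment() directly. A rough sketch of that
dependency, paraphrasing the quoted code rather than proposing a
change:

  if (Value.getOpcode() == ISD::BIT_CONVERT && !ST->isTruncatingStore() &&
      ... /* new store's alignment requirement is still satisfied */) {
    // This alignment test uses ST->getAlignment(); with the inference
    // block hoisted back above it, the refined value could let the
    // combine fire in more cases.
  }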

>   if (CombinerAA) {
>     // Walk up chain skipping non-aliasing memory nodes.
>     SDValue BetterChain = FindBetterChain(N, Chain);
> 
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp?rev=100090&r1=100089&r2=100090&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp Thu Apr  1 01:04:33 2010
> @@ -3132,11 +3132,17 @@
>   if (Str.empty()) {
>     if (VT.isInteger())
>       return DAG.getConstant(0, VT);
> -    unsigned NumElts = VT.getVectorNumElements();
> -    MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
> -    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
> -                       DAG.getConstant(0,
> -                       EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts)));
> +    else if (VT.getSimpleVT().SimpleTy == MVT::f32 ||
> +             VT.getSimpleVT().SimpleTy == MVT::f64)
> +      return DAG.getConstantFP(0.0, VT);
> +    else if (VT.isVector()) {
> +      unsigned NumElts = VT.getVectorNumElements();
> +      MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
> +      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
> +                         DAG.getConstant(0, EVT::getVectorVT(*DAG.getContext(),
> +                                                             EltVT, NumElts)));
> +    } else
> +      llvm_unreachable("Expected type!");
>   }
> 
>   assert(!VT.isVector() && "Can't handle vector type here!");
> @@ -3184,51 +3190,33 @@
>   return false;
> }
> 
> -/// MeetsMaxMemopRequirement - Determines if the number of memory ops required
> -/// to replace the memset / memcpy is below the threshold. It also returns the
> -/// types of the sequence of memory ops to perform memset / memcpy.
> -static
> -bool MeetsMaxMemopRequirement(std::vector<EVT> &MemOps,
> -                              SDValue Dst, SDValue Src,
> -                              unsigned Limit, uint64_t Size, unsigned &Align,
> -                              std::string &Str, bool &isSrcStr,
> -                              SelectionDAG &DAG,
> -                              const TargetLowering &TLI) {
> -  isSrcStr = isMemSrcFromString(Src, Str);
> -  bool isSrcConst = isa<ConstantSDNode>(Src);
> -  EVT VT = TLI.getOptimalMemOpType(Size, Align, isSrcConst, isSrcStr, DAG);
> -  bool AllowUnalign = TLI.allowsUnalignedMemoryAccesses(VT);
> -  if (VT != MVT::Other) {
> -    const Type *Ty = VT.getTypeForEVT(*DAG.getContext());
> -    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
> -    // If source is a string constant, this will require an unaligned load.
> -    if (NewAlign > Align && (isSrcConst || AllowUnalign)) {
> -      if (Dst.getOpcode() != ISD::FrameIndex) {
> -        // Can't change destination alignment. It requires a unaligned store.
> -        if (AllowUnalign)
> -          VT = MVT::Other;
> -      } else {
> -        int FI = cast<FrameIndexSDNode>(Dst)->getIndex();
> -        MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
> -        if (MFI->isFixedObjectIndex(FI)) {
> -          // Can't change destination alignment. It requires a unaligned store.
> -          if (AllowUnalign)
> -            VT = MVT::Other;
> -        } else {
> -          // Give the stack frame object a larger alignment if needed.
> -          if (MFI->getObjectAlignment(FI) < NewAlign)
> -            MFI->setObjectAlignment(FI, NewAlign);
> -          Align = NewAlign;
> -        }
> -      }
> -    }
> -  }
> +/// FindOptimalMemOpLowering - Determines the optimal series of memory ops
> +/// to replace the memset / memcpy. Returns true if the number of memory ops
> +/// is below the threshold. It also returns, by reference, the types of the
> +/// sequence of memory ops used to perform the memset / memcpy.
> +static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
> +                                     SDValue Dst, SDValue Src,
> +                                     unsigned Limit, uint64_t Size,
> +                                     unsigned DstAlign, unsigned SrcAlign,
> +                                     SelectionDAG &DAG,
> +                                     const TargetLowering &TLI) {
> +  assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
> +         "Expecting memcpy / memset source to meet alignment requirement!");
> +  // If 'SrcAlign' is zero, that means the memory operation does not need to
> +  // load the value, i.e. memset or memcpy from a constant string. Otherwise,
> +  // it's the inferred alignment of the source. 'DstAlign', on the other
> +  // hand, is the specified alignment of the memory operation. If it is zero,
> +  // that means it's possible to change the alignment of the destination.
> +  EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign, DAG);
> 
>   if (VT == MVT::Other) {
> -    if (TLI.allowsUnalignedMemoryAccesses(MVT::i64)) {
> +    VT = TLI.getPointerTy();
> +    const Type *Ty = VT.getTypeForEVT(*DAG.getContext());
> +    if (DstAlign >= TLI.getTargetData()->getABITypeAlignment(Ty) ||
> +        TLI.allowsUnalignedMemoryAccesses(VT)) {
>       VT = MVT::i64;
>     } else {
> -      switch (Align & 7) {
> +      switch (DstAlign & 7) {
>       case 0:  VT = MVT::i64; break;
>       case 4:  VT = MVT::i32; break;
>       case 2:  VT = MVT::i16; break;
> @@ -3250,7 +3238,7 @@
>     unsigned VTSize = VT.getSizeInBits() / 8;
>     while (VTSize > Size) {
>       // For now, only use non-vector load / store's for the left-over pieces.
> -      if (VT.isVector()) {
> +      if (VT.isVector() || VT.isFloatingPoint()) {
>         VT = MVT::i64;
>         while (!TLI.isTypeLegal(VT))
>           VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
> @@ -3286,15 +3274,33 @@
>   uint64_t Limit = -1ULL;
>   if (!AlwaysInline)
>     Limit = TLI.getMaxStoresPerMemcpy();
> -  unsigned DstAlign = Align;  // Destination alignment can change.
> +  bool DstAlignCanChange = false;
> +  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
> +  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
> +  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
> +    DstAlignCanChange = true;
> +  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
> +  if (Align > SrcAlign)
> +    SrcAlign = Align;
>   std::string Str;
> -  bool CopyFromStr;
> -  if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
> -                                Str, CopyFromStr, DAG, TLI))
> +  bool CopyFromStr = isMemSrcFromString(Src, Str);
> +  bool isZeroStr = CopyFromStr && Str.empty();
> +  if (!FindOptimalMemOpLowering(MemOps, Dst, Src, Limit, Size,
> +                                (DstAlignCanChange ? 0 : Align),
> +                                (isZeroStr ? 0 : SrcAlign), DAG, TLI))
>     return SDValue();
> 
> +  if (DstAlignCanChange) {
> +    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
> +    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
> +    if (NewAlign > Align) {
> +      // Give the stack frame object a larger alignment if needed.
> +      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
> +        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
> +      Align = NewAlign;
> +    }
> +  }
> 
> -  bool isZeroStr = CopyFromStr && Str.empty();
>   SmallVector<SDValue, 8> OutChains;
>   unsigned NumMemOps = MemOps.size();
>   uint64_t SrcOff = 0, DstOff = 0;
> @@ -3303,16 +3309,17 @@
>     unsigned VTSize = VT.getSizeInBits() / 8;
>     SDValue Value, Store;
> 
> -    if (CopyFromStr && (isZeroStr || !VT.isVector())) {
> +    if (CopyFromStr &&
> +        (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
>       // It's unlikely a store of a vector immediate can be done in a single
>       // instruction. It would require a load from a constantpool first.
> -      // We also handle store a vector with all zero's.
> +      // We only handle zero vectors here.
>       // FIXME: Handle other cases where store of vector immediate is done in
>       // a single instruction.
>       Value = getMemsetStringVal(VT, dl, DAG, TLI, Str, SrcOff);
>       Store = DAG.getStore(Chain, dl, Value,
>                            getMemBasePlusOffset(Dst, DstOff, DAG),
> -                           DstSV, DstSVOff + DstOff, false, false, DstAlign);
> +                           DstSV, DstSVOff + DstOff, false, false, Align);
>     } else {
>       // The type might not be legal for the target.  This should only happen
>       // if the type is smaller than a legal type, as on PPC, so the right
> @@ -3323,11 +3330,12 @@
>       assert(NVT.bitsGE(VT));
>       Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
>                              getMemBasePlusOffset(Src, SrcOff, DAG),
> -                             SrcSV, SrcSVOff + SrcOff, VT, false, false, Align);
> +                             SrcSV, SrcSVOff + SrcOff, VT, false, false,
> +                             MinAlign(SrcAlign, SrcOff));
>       Store = DAG.getTruncStore(Chain, dl, Value,
>                                 getMemBasePlusOffset(Dst, DstOff, DAG),
>                                 DstSV, DstSVOff + DstOff, VT, false, false,
> -                                DstAlign);
> +                                Align);
>     }
>     OutChains.push_back(Store);
>     SrcOff += VTSize;
> @@ -3339,11 +3347,11 @@
> }
> 
> static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
> -                                          SDValue Chain, SDValue Dst,
> -                                          SDValue Src, uint64_t Size,
> -                                          unsigned Align, bool AlwaysInline,
> -                                          const Value *DstSV, uint64_t DstSVOff,
> -                                          const Value *SrcSV, uint64_t SrcSVOff){
> +                                        SDValue Chain, SDValue Dst,
> +                                        SDValue Src, uint64_t Size,
> +                                        unsigned Align, bool AlwaysInline,
> +                                        const Value *DstSV, uint64_t DstSVOff,
> +                                        const Value *SrcSV, uint64_t SrcSVOff) {
>   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> 
>   // Expand memmove to a series of load and store ops if the size operand falls
> @@ -3352,15 +3360,32 @@
>   uint64_t Limit = -1ULL;
>   if (!AlwaysInline)
>     Limit = TLI.getMaxStoresPerMemmove();
> -  unsigned DstAlign = Align;  // Destination alignment can change.
> -  std::string Str;
> -  bool CopyFromStr;
> -  if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
> -                                Str, CopyFromStr, DAG, TLI))
> +  bool DstAlignCanChange = false;
> +  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
> +  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
> +  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
> +    DstAlignCanChange = true;
> +  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
> +  if (Align > SrcAlign)
> +    SrcAlign = Align;
> +
> +  if (!FindOptimalMemOpLowering(MemOps, Dst, Src, Limit, Size,
> +                                (DstAlignCanChange ? 0 : Align),
> +                                SrcAlign, DAG, TLI))
>     return SDValue();
> 
> -  uint64_t SrcOff = 0, DstOff = 0;
> +  if (DstAlignCanChange) {
> +    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
> +    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
> +    if (NewAlign > Align) {
> +      // Give the stack frame object a larger alignment if needed.
> +      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
> +        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
> +      Align = NewAlign;
> +    }
> +  }
> 
> +  uint64_t SrcOff = 0, DstOff = 0;
>   SmallVector<SDValue, 8> LoadValues;
>   SmallVector<SDValue, 8> LoadChains;
>   SmallVector<SDValue, 8> OutChains;
> @@ -3372,7 +3397,7 @@
> 
>     Value = DAG.getLoad(VT, dl, Chain,
>                         getMemBasePlusOffset(Src, SrcOff, DAG),
> -                        SrcSV, SrcSVOff + SrcOff, false, false, Align);
> +                        SrcSV, SrcSVOff + SrcOff, false, false, SrcAlign);
>     LoadValues.push_back(Value);
>     LoadChains.push_back(Value.getValue(1));
>     SrcOff += VTSize;
> @@ -3387,7 +3412,7 @@
> 
>     Store = DAG.getStore(Chain, dl, LoadValues[i],
>                          getMemBasePlusOffset(Dst, DstOff, DAG),
> -                         DstSV, DstSVOff + DstOff, false, false, DstAlign);
> +                         DstSV, DstSVOff + DstOff, false, false, Align);
>     OutChains.push_back(Store);
>     DstOff += VTSize;
>   }
> @@ -3397,24 +3422,38 @@
> }
> 
> static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
> -                                 SDValue Chain, SDValue Dst,
> -                                 SDValue Src, uint64_t Size,
> -                                 unsigned Align,
> -                                 const Value *DstSV, uint64_t DstSVOff) {
> +                               SDValue Chain, SDValue Dst,
> +                               SDValue Src, uint64_t Size,
> +                               unsigned Align,
> +                               const Value *DstSV, uint64_t DstSVOff) {
>   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> 
>   // Expand memset to a series of load/store ops if the size operand
>   // falls below a certain threshold.
>   std::vector<EVT> MemOps;
> -  std::string Str;
> -  bool CopyFromStr;
> -  if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, TLI.getMaxStoresPerMemset(),
> -                                Size, Align, Str, CopyFromStr, DAG, TLI))
> +  bool DstAlignCanChange = false;
> +  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
> +  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
> +  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
> +    DstAlignCanChange = true;
> +  if (!FindOptimalMemOpLowering(MemOps, Dst, Src, TLI.getMaxStoresPerMemset(),
> +                                Size, (DstAlignCanChange ? 0 : Align), 0,
> +                                DAG, TLI))
>     return SDValue();
> 
> +  if (DstAlignCanChange) {
> +    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
> +    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
> +    if (NewAlign > Align) {
> +      // Give the stack frame object a larger alignment if needed.
> +      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
> +        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
> +      Align = NewAlign;
> +    }
> +  }
> +
>   SmallVector<SDValue, 8> OutChains;
>   uint64_t DstOff = 0;
> -
>   unsigned NumMemOps = MemOps.size();
>   for (unsigned i = 0; i < NumMemOps; i++) {
>     EVT VT = MemOps[i];
> @@ -3445,10 +3484,9 @@
>     if (ConstantSize->isNullValue())
>       return Chain;
> 
> -    SDValue Result =
> -      getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
> -                              ConstantSize->getZExtValue(),
> -                              Align, false, DstSV, DstSVOff, SrcSV, SrcSVOff);
> +    SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
> +                                             ConstantSize->getZExtValue(),Align,
> +                                       false, DstSV, DstSVOff, SrcSV, SrcSVOff);
>     if (Result.getNode())
>       return Result;
>   }
> @@ -6106,8 +6144,18 @@
>   // If this is a GlobalAddress + cst, return the alignment.
>   GlobalValue *GV;
>   int64_t GVOffset = 0;
> -  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset))
> -    return MinAlign(GV->getAlignment(), GVOffset);
> +  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
> +    // If GV has specified alignment, then use it. Otherwise, use the preferred
> +    // alignment.
> +    unsigned Align = GV->getAlignment();
> +    if (!Align) {
> +      if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) {
> +        const TargetData *TD = TLI.getTargetData();
> +        Align = TD->getPreferredAlignment(GVar);
> +      }
> +    }

I don't think it's safe to check for the preferred alignment here. If the
variable is defined in a different translation unit, it may not be aligned
to more than the ABI alignment or the GV->getAlignment() value.

The optimizer should already be upgrading global variables to their
preferred alignment when safe, so codegen shouldn't have to replicate this.
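
A minimal illustration of the hazard, using two hypothetical
translation units (not from the patch):

  // a.cpp -- defines the variable. The compiler emitting this TU is
  // only obligated to give it the ABI alignment for int[4].
  int g[4];

  // b.cpp -- codegen here sees only the declaration. Assuming the
  // larger "preferred" alignment for g when widening memory ops would
  // be unsound if a.cpp's compiler left g at 4-byte alignment.
  extern int g[4];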

> +    return MinAlign(Align, GVOffset);
> +  }
> 
>   // If this is a direct reference to a stack slot, use information about the
>   // stack slot's alignment.
> 

> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=100090&r1=100089&r2=100090&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Apr  1 01:04:33 2010
> @@ -1012,7 +1012,7 @@
>   // FIXME: These should be based on subtarget info. Plus, the values should
>   // be smaller when we are in optimizing for size mode.
>   maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
> -  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
> +  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
>   maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
>   setPrefLoopAlignment(16);
>   benefitFromCodePlacementOpt = true;
> @@ -1074,19 +1074,27 @@
> /// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
> /// determining it.
> EVT
> -X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
> -                                       bool isSrcConst, bool isSrcStr,
> +X86TargetLowering::getOptimalMemOpType(uint64_t Size,
> +                                       unsigned DstAlign, unsigned SrcAlign,
>                                        SelectionDAG &DAG) const {
>   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
>   // linux.  This is because the stack realignment code can't handle certain
>   // cases like PR2962.  This should be removed when PR2962 is fixed.
>   const Function *F = DAG.getMachineFunction().getFunction();
> -  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
> -  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
> -    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
> -      return MVT::v4i32;
> -    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
> -      return MVT::v4f32;
> +  if (!F->hasFnAttr(Attribute::NoImplicitFloat)) {
> +    if (Size >= 16 &&
> +        (Subtarget->isUnalignedMemAccessFast() ||
> +         (DstAlign == 0 || DstAlign >= 16) &&
> +         (SrcAlign == 0 || SrcAlign >= 16)) &&

It's not obvious what this code intends. What does a 0 align value
mean in this context?
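
From the block comment in the SelectionDAG.cpp hunk above, my reading
is that 0 is a sentinel: DstAlign == 0 means the destination's
alignment can still be raised (a non-fixed stack object), and
SrcAlign == 0 means no load is needed at all (memset, or memcpy from a
constant string). If that's right, restating it at this use would
help; a sketch, with hypothetical wording:

  // DstAlign == 0: the caller may still raise the destination's
  // alignment, so don't let it veto a wide type. SrcAlign == 0: there
  // is no load (memset / memcpy from constant data), so only the
  // destination constrains the choice of type.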

> +        Subtarget->getStackAlignment() >= 16) {
> +      if (Subtarget->hasSSE2())
> +        return MVT::v4i32;
> +      if (Subtarget->hasSSE1())
> +        return MVT::v4f32;
> +    } else if (Size >= 8 &&
> +               Subtarget->getStackAlignment() >= 8 &&
> +               Subtarget->hasSSE2())
> +      return MVT::f64;
>   }

Why is f64 chosen here? I guess it's because all targets where
i64 is legal have SSE2, so they wouldn't get here. That's a bit
subtle though; could you add a comment about this?
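
Something along these lines, if my guess is right (hypothetical
wording, not in the patch):

  // Use f64 for the 8-byte case: it only requires SSE2. i64 is only
  // legal on x86-64, and every x86-64 subtarget has SSE2, so such
  // targets return f64 here and the i64 case below only triggers when
  // FP memory ops were ruled out.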

Thanks,

Dan

>   if (Subtarget->is64Bit() && Size >= 8)
>     return MVT::i64;
> 




