[llvm] r192474 - [DAGCombiner] Revert load slicing (r192471), until I figure out why it fails on ubuntu.

Quentin Colombet qcolombet at apple.com
Fri Oct 11 11:17:17 PDT 2013


Author: qcolombet
Date: Fri Oct 11 13:17:17 2013
New Revision: 192474

URL: http://llvm.org/viewvc/llvm-project?rev=192474&view=rev
Log:
[DAGCombiner] Revert load slicing (r192471), until I figure out why it fails on ubuntu.

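For context, the reverted combine targeted loads whose value is consumed only as
smaller chunks through trunc and trunc(lshr), as described in the removed doc
comments below. The following standalone C++ sketch (illustration only; the
function names are made up for this note, and the sliced variants assume a
little-endian layout) shows the before/after shape of the transformation at the
source level:

    #include <cstdint>
    #include <cstring>

    // Before slicing: one wide load, then shift/truncate to extract chunks.
    uint32_t low_via_wide_load(const uint64_t *addr) {
      uint64_t a = *addr;          // a = load i64* addr
      return (uint32_t)a;          // b = trunc i64 a to i32
    }
    uint32_t high_via_wide_load(const uint64_t *addr) {
      uint64_t a = *addr;          // a = load i64* addr
      uint64_t c = a >> 32;        // c = lshr i64 a, 32
      return (uint32_t)c;          // d = trunc i64 c to i32
    }

    // After slicing: two narrow loads at adjacent addresses, with no shift
    // and no truncate (addr1 = addr, addr2 = addr + 4 on little-endian).
    uint32_t low_via_slice(const uint64_t *addr) {
      uint32_t b;
      std::memcpy(&b, addr, sizeof b);                    // b = load i32* addr1
      return b;
    }
    uint32_t high_via_slice(const uint64_t *addr) {
      uint32_t d;
      std::memcpy(&d, (const char *)addr + 4, sizeof d);  // d = load i32* addr2
      return d;
    }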
Removed:
    llvm/trunk/test/CodeGen/X86/load-slice.ll
Modified:
    llvm/trunk/include/llvm/Target/TargetLowering.h
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Modified: llvm/trunk/include/llvm/Target/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetLowering.h?rev=192474&r1=192473&r2=192474&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/Target/TargetLowering.h Fri Oct 11 13:17:17 2013
@@ -1183,35 +1183,6 @@ public:
     return false;
   }
 
-  /// Return true if the target can combine two values of type LoadedType,
-  /// loaded next to each other in memory, into a single paired load.
-  /// RequiredAlignment gives the minimal alignment constraint that must be
-  /// met to select this paired load.
-  ///
-  /// This information is *not* used to generate actual paired loads, but it is used
-  /// to generate a sequence of loads that is easier to combine into a paired load.
-  /// For instance, something like this:
-  /// a = load i64* addr
-  /// b = trunc i64 a to i32
-  /// c = lshr i64 a, 32
-  /// d = trunc i64 c to i32
-  /// will be optimized into:
-  /// b = load i32* addr1
-  /// d = load i32* addr2
-  /// Where addr1 = addr2 +/- sizeof(i32).
-  ///
-  /// In other words, unless the target performs post-isel load combining,
-  /// this information should not be provided: it will generate more loads.
-  virtual bool hasPairedLoad(Type * /*LoadedType*/,
-                             unsigned & /*RequiredAlignment*/) const {
-    return false;
-  }
-
-  virtual bool hasPairedLoad(EVT /*LoadedType*/,
-                             unsigned & /*RequiredAlignment*/) const {
-    return false;
-  }
-
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).

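The hasPairedLoad hooks removed above let a target advertise that two adjacent
loads of a given type can be selected as one paired load, and report the
alignment that selection requires. As a rough illustration, here is a
standalone mock of that contract (not LLVM code: the real hooks take
Type*/EVT and live on TargetLowering, and the i32/8-byte policy below is an
invented example):

    #include <cstdio>

    struct MockTargetLowering {
      // Default, as in the removed code: no pairing support.
      virtual bool hasPairedLoad(unsigned LoadedBits,
                                 unsigned &RequiredAlignment) const {
        (void)LoadedBits;
        (void)RequiredAlignment;
        return false;
      }
      virtual ~MockTargetLowering() {}
    };

    // Hypothetical target that can pair two 32-bit loads when the first
    // one is 8-byte aligned.
    struct MockPairingLowering : MockTargetLowering {
      bool hasPairedLoad(unsigned LoadedBits,
                         unsigned &RequiredAlignment) const override {
        if (LoadedBits != 32)
          return false;
        RequiredAlignment = 8;
        return true;
      }
    };

    int main() {
      MockPairingLowering TLI;
      unsigned Align = 0;
      std::printf("paired i32? %d (align %u)\n",
                  TLI.hasPairedLoad(32, Align), Align);
    }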
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=192474&r1=192473&r2=192474&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Fri Oct 11 13:17:17 2013
@@ -35,7 +35,6 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
@@ -45,7 +44,6 @@ STATISTIC(PreIndexedNodes , "Number of p
 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
 STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
 STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
-STATISTIC(SlicedLoads, "Number of loads sliced");
 
 namespace {
   static cl::opt<bool>
@@ -56,14 +54,6 @@ namespace {
     CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
                cl::desc("Include global information in alias analysis"));
 
-  /// Hidden option to stress test load slicing, i.e., when this option
-  /// is enabled, load slicing bypasses most of its profitability guards.
-  static cl::opt<bool>
-  StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
-                    cl::desc("Bypass the profitability model of load "
-                             "slicing"),
-                    cl::init(false));
-
 //------------------------------ DAGCombiner ---------------------------------//
 
   class DAGCombiner {
@@ -73,7 +63,6 @@ namespace {
     CodeGenOpt::Level OptLevel;
     bool LegalOperations;
     bool LegalTypes;
-    bool ForCodeSize;
 
     // Worklist of all of the nodes that need to be simplified.
     //
@@ -156,7 +145,6 @@ namespace {
 
     bool CombineToPreIndexedLoadStore(SDNode *N);
     bool CombineToPostIndexedLoadStore(SDNode *N);
-    bool SliceUpLoad(SDNode *N);
 
     void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
     SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
@@ -328,15 +316,8 @@ namespace {
 
   public:
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
-        : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
-          OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
-      AttributeSet FnAttrs =
-          DAG.getMachineFunction().getFunction()->getAttributes();
-      ForCodeSize =
-          FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                               Attribute::OptimizeForSize) ||
-          FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
-    }
+      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
+        OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
 
     /// Run - runs the dag combiner on all nodes in the work list
     void Run(CombineLevel AtLevel);
@@ -7598,562 +7579,9 @@ SDValue DAGCombiner::visitLOAD(SDNode *N
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
 
-  // Try to slice up N to more direct loads if the slices are mapped to
-  // different register banks or pairing can take place.
-  if (SliceUpLoad(N))
-    return SDValue(N, 0);
-
   return SDValue();
 }
 
-namespace {
-/// \brief Helper structure used to slice a load into smaller loads.
-/// Basically a slice is obtained from the following sequence:
-/// Origin = load Ty1, Base
-/// Shift = srl Ty1 Origin, CstTy Amount
-/// Inst = trunc Shift to Ty2
-///
-/// Then, it will be rewritten into:
-/// Slice = load SliceTy, Base + SliceOffset
-/// [Inst = zext Slice to Ty2], only if SliceTy != Ty2
-///
-/// SliceTy is deduced from the number of bits that are actually used to
-/// build Inst.
-struct LoadedSlice {
-  /// \brief Helper structure used to compute the cost of a slice.
-  struct Cost {
-    /// Are we optimizing for code size.
-    bool ForCodeSize;
-    /// Various costs.
-    unsigned Loads;
-    unsigned Truncates;
-    unsigned CrossRegisterBanksCopies;
-    unsigned ZExts;
-    unsigned Shift;
-
-    Cost(bool ForCodeSize = false)
-        : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
-          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}
-
-    /// \brief Get the cost of one isolated slice.
-    Cost(const LoadedSlice &LS, bool ForCodeSize = false)
-        : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
-          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
-      EVT TruncType = LS.Inst->getValueType(0);
-      EVT LoadedType = LS.getLoadedType();
-      if (TruncType != LoadedType &&
-          !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
-        ZExts = 1;
-    }
-
-    /// \brief Account for slicing gain in the current cost.
-    /// Slicing provides a few gains, like removing a shift or a
-    /// truncate. This method grows the cost of the original
-    /// load with the gain from this slice.
-    void addSliceGain(const LoadedSlice &LS) {
-      // Each slice saves a truncate.
-      const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
-      if (!TLI.isTruncateFree(LS.Inst->getValueType(0),
-                              LS.Inst->getOperand(0).getValueType()))
-        ++Truncates;
-      // If there is a shift amount, this slice gets rid of it.
-      if (LS.Shift)
-        ++Shift;
-      // If this slice can merge a cross register bank copy, account for it.
-      if (LS.canMergeExpensiveCrossRegisterBankCopy())
-        ++CrossRegisterBanksCopies;
-    }
-
-    Cost &operator+=(const Cost &RHS) {
-      Loads += RHS.Loads;
-      Truncates += RHS.Truncates;
-      CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
-      ZExts += RHS.ZExts;
-      Shift += RHS.Shift;
-      return *this;
-    }
-
-    bool operator==(const Cost &RHS) const {
-      return Loads == RHS.Loads && Truncates == RHS.Truncates &&
-             CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
-             ZExts == RHS.ZExts && Shift == RHS.Shift;
-    }
-
-    bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
-
-    bool operator<(const Cost &RHS) const {
-      // Assume cross register banks copies are as expensive as loads.
-      // FIXME: Do we want some more target hooks?
-      unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
-      unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
-      // Unless we are optimizing for code size, consider the
-      // expensive operation first.
-      if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
-        return ExpensiveOpsLHS < ExpensiveOpsRHS;
-      return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
-             (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
-    }
-
-    bool operator>(const Cost &RHS) const { return RHS < *this; }
-
-    bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
-
-    bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
-  };
-  // The last instruction that represents the slice. This should be a
-  // truncate instruction.
-  SDNode *Inst;
-  // The original load instruction.
-  LoadSDNode *Origin;
-  // The right shift amount in bits from the original load.
-  unsigned Shift;
-  // The DAG that Origin comes from.
-  // This is used to get some contextual information about legal types, etc.
-  SelectionDAG *DAG;
-
-  LoadedSlice(SDNode *Inst = NULL, LoadSDNode *Origin = NULL,
-              unsigned Shift = 0, SelectionDAG *DAG = NULL)
-      : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
-
-  LoadedSlice(const LoadedSlice &LS)
-      : Inst(LS.Inst), Origin(LS.Origin), Shift(LS.Shift), DAG(LS.DAG) {}
-
-  /// \brief Get the bits used in a chunk of bits \p BitWidth large.
-  /// \return Result is \p BitWidth bits wide, with used bits set to 1 and
-  ///         unused bits set to 0.
-  APInt getUsedBits() const {
-    // Reproduce the trunc(lshr) sequence:
-    // - Start from the truncated value.
-    // - Zero extend to the desired bit width.
-    // - Shift left.
-    assert(Origin && "No original load to compare against.");
-    unsigned BitWidth = Origin->getValueSizeInBits(0);
-    assert(Inst && "This slice is not bound to an instruction");
-    assert(Inst->getValueSizeInBits(0) <= BitWidth &&
-           "Extracted slice is bigger than the whole type!");
-    APInt UsedBits(Inst->getValueSizeInBits(0), 0);
-    UsedBits.setAllBits();
-    UsedBits = UsedBits.zext(BitWidth);
-    UsedBits <<= Shift;
-    return UsedBits;
-  }
-
-  /// \brief Get the size of the slice to be loaded in bytes.
-  unsigned getLoadedSize() const {
-    unsigned SliceSize = getUsedBits().countPopulation();
-    assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
-    return SliceSize / 8;
-  }
-
-  /// \brief Get the type that will be loaded for this slice.
-  /// Note: This may not be the final type for the slice.
-  EVT getLoadedType() const {
-    assert(DAG && "Missing context");
-    LLVMContext &Ctxt = *DAG->getContext();
-    return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
-  }
-
-  /// \brief Get the alignment of the load used for this slice.
-  unsigned getAlignment() const {
-    unsigned Alignment = Origin->getAlignment();
-    unsigned Offset = getOffsetFromBase();
-    if (Offset != 0)
-      Alignment = MinAlign(Alignment, Alignment + Offset);
-    return Alignment;
-  }
-
-  /// \brief Check if this slice can be rewritten with legal operations.
-  bool isLegal() const {
-    // An invalid slice is not legal.
-    if (!Origin || !Inst || !DAG)
-      return false;
-
-    // Offsets are for indexed loads only; we do not handle that.
-    if (Origin->getOffset().getOpcode() != ISD::UNDEF)
-      return false;
-
-    const TargetLowering &TLI = DAG->getTargetLoweringInfo();
-
-    // Check that the type is legal.
-    EVT SliceType = getLoadedType();
-    if (!TLI.isTypeLegal(SliceType))
-      return false;
-
-    // Check that the load is legal for this type.
-    if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
-      return false;
-
-    // Check that the offset can be computed.
-    // 1. Check its type.
-    EVT PtrType = Origin->getBasePtr().getValueType();
-    if (PtrType == MVT::Untyped || PtrType.isExtended())
-      return false;
-
-    // 2. Check that it fits in the immediate.
-    if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
-      return false;
-
-    // 3. Check that the computation is legal.
-    if (!TLI.isOperationLegal(ISD::ADD, PtrType))
-      return false;
-
-    // Check that the zext is legal if it needs one.
-    EVT TruncateType = Inst->getValueType(0);
-    if (TruncateType != SliceType &&
-        !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
-      return false;
-
-    return true;
-  }
-
-  /// \brief Get the offset in bytes of this slice in the original chunk of
-  /// bits.
-  /// \pre DAG != NULL.
-  uint64_t getOffsetFromBase() const {
-    assert(DAG && "Missing context.");
-    bool IsBigEndian =
-        DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian();
-    assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
-    uint64_t Offset = Shift / 8;
-    unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
-    assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
-           "The size of the original loaded type is not a multiple of a"
-           " byte.");
-    // If Offset is bigger than TySizeInBytes, it means we are loading all
-    // zeros. This should have been optimized before in the process.
-    assert(TySizeInBytes > Offset &&
-           "Invalid shift amount for given loaded size");
-    if (IsBigEndian)
-      Offset = TySizeInBytes - Offset - getLoadedSize();
-    return Offset;
-  }
-
-  /// \brief Generate the sequence of instructions to load the slice
-  /// represented by this object and redirect the uses of this slice to
-  /// this new sequence of instructions.
-  /// \pre this->Inst && this->Origin are valid Instructions and this
-  /// object passed the legal check: LoadedSlice::isLegal returned true.
-  /// \return The last instruction of the sequence used to load the slice.
-  SDValue loadSlice() const {
-    assert(Inst && Origin && "Unable to replace a non-existing slice.");
-    const SDValue &OldBaseAddr = Origin->getBasePtr();
-    SDValue BaseAddr = OldBaseAddr;
-    // Get the offset in that chunk of bytes w.r.t. the endianness.
-    int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
-    assert(Offset >= 0 && "Offset too big to fit in int64_t!");
-    if (Offset) {
-      // BaseAddr = BaseAddr + Offset.
-      EVT ArithType = BaseAddr.getValueType();
-      BaseAddr = DAG->getNode(ISD::ADD, SDLoc(Origin), ArithType, BaseAddr,
-                              DAG->getConstant(Offset, ArithType));
-    }
-
-    // Create the type of the loaded slice according to its size.
-    EVT SliceType = getLoadedType();
-
-    // Create the load for the slice.
-    SDValue LastInst = DAG->getLoad(
-        SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
-        Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(),
-        Origin->isNonTemporal(), Origin->isInvariant(), getAlignment());
-    // If the final type is not the same as the loaded type, this means that
-    // we have to pad with zero. Create a zero extend for that.
-    EVT FinalType = Inst->getValueType(0);
-    if (SliceType != FinalType)
-      LastInst =
-          DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
-    return LastInst;
-  }
-
-  /// \brief Check if this slice can be merged with an expensive cross register
-  /// bank copy. E.g.,
-  /// i = load i32
-  /// f = bitcast i32 i to float
-  bool canMergeExpensiveCrossRegisterBankCopy() const {
-    if (!Inst || !Inst->hasOneUse())
-      return false;
-    SDNode *Use = *Inst->use_begin();
-    if (Use->getOpcode() != ISD::BITCAST)
-      return false;
-    assert(DAG && "Missing context");
-    const TargetLowering &TLI = DAG->getTargetLoweringInfo();
-    EVT ResVT = Use->getValueType(0);
-    const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
-    const TargetRegisterClass *ArgRC =
-        TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
-    if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
-      return false;
-
-    // At this point, we know that we perform a cross-register-bank copy.
-    // Check if it is expensive.
-    const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo();
-    // Assume bitcasts are cheap if the two register classes explicitly
-    // share a common subclass.
-    if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
-      return false;
-
-    // Check if it will be merged with the load.
-    // 1. Check the alignment constraint.
-    unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment(
-        ResVT.getTypeForEVT(*DAG->getContext()));
-
-    if (RequiredAlignment > getAlignment())
-      return false;
-
-    // 2. Check that the load is a legal operation for that type.
-    if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
-      return false;
-
-    // 3. Check that we do not have a zext in the way.
-    if (Inst->getValueType(0) != getLoadedType())
-      return false;
-
-    return true;
-  }
-};
-}
-
-/// \brief Sorts LoadedSlices according to their offsets.
-struct LoadedSliceSorter {
-  bool operator()(const LoadedSlice &LHS, const LoadedSlice &RHS) {
-    assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
-    return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
-  }
-};
-
-/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
-/// \p UsedBits looks like 0..0 1..1 0..0.
-static bool areUsedBitsDense(const APInt &UsedBits) {
-  // If all the bits are one, this is dense!
-  if (UsedBits.isAllOnesValue())
-    return true;
-
-  // Get rid of the unused bits on the right.
-  APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
-  // Get rid of the unused bits on the left.
-  if (NarrowedUsedBits.countLeadingZeros())
-    NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
-  // Check that the chunk of bits is completely used.
-  return NarrowedUsedBits.isAllOnesValue();
-}
-
-/// \brief Check whether or not \p First and \p Second are next to each other
-/// in memory. This means that there is no hole between the bits loaded
-/// by \p First and the bits loaded by \p Second.
-static bool areSlicesNextToEachOther(const LoadedSlice &First,
-                                     const LoadedSlice &Second) {
-  assert(First.Origin == Second.Origin && First.Origin &&
-         "Unable to match different memory origins.");
-  APInt UsedBits = First.getUsedBits();
-  assert((UsedBits & Second.getUsedBits()) == 0 &&
-         "Slices are not supposed to overlap.");
-  UsedBits |= Second.getUsedBits();
-  return areUsedBitsDense(UsedBits);
-}
-
-/// \brief Adjust the \p GlobalLSCost according to the target
-/// pairing capabilities and the layout of the slices.
-/// \pre \p GlobalLSCost should account for at least as many loads as
-/// there are slices in \p LoadedSlices.
-static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
-                                 LoadedSlice::Cost &GlobalLSCost) {
-  unsigned NumberOfSlices = LoadedSlices.size();
-  // If there are fewer than 2 elements, no pairing is possible.
-  if (NumberOfSlices < 2)
-    return;
-
-  // Sort the slices so that elements that are likely to be next to each
-  // other in memory are next to each other in the list.
-  std::sort(LoadedSlices.begin(), LoadedSlices.end(), LoadedSliceSorter());
-  const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
-  // First (resp. Second) is the first (resp. second) candidate to be
-  // placed in a paired load.
-  const LoadedSlice *First = NULL;
-  const LoadedSlice *Second = NULL;
-  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
-                // Set the beginning of the pair.
-                                                           First = Second) {
-
-    Second = &LoadedSlices[CurrSlice];
-
-    // If First is NULL, it means we start a new pair.
-    // Get to the next slice.
-    if (!First)
-      continue;
-
-    EVT LoadedType = First->getLoadedType();
-
-    // If the types of the slices are different, we cannot pair them.
-    if (LoadedType != Second->getLoadedType())
-      continue;
-
-    // Check if the target supplies paired loads for this type.
-    unsigned RequiredAlignment = 0;
-    if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
-      // Move to the next pair; this type is hopeless.
-      Second = NULL;
-      continue;
-    }
-    // Check if we meet the alignment requirement.
-    if (RequiredAlignment > First->getAlignment())
-      continue;
-
-    // Check that both loads are next to each other in memory.
-    if (!areSlicesNextToEachOther(*First, *Second))
-      continue;
-
-    assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
-    --GlobalLSCost.Loads;
-    // Move to the next pair.
-    Second = NULL;
-  }
-}
-
-/// \brief Check the profitability of all involved LoadedSlices.
-/// Currently, slicing is considered profitable if there are exactly two
-/// involved slices (1) which are (2) next to each other in memory, and
-/// whose combined cost (\see LoadedSlice::Cost) is smaller than the original load (3).
-///
-/// Note: The order of the elements in \p LoadedSlices may be modified, but not
-/// the elements themselves.
-///
-/// FIXME: When the cost model is mature enough, we can relax
-/// constraints (1) and (2).
-static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
-                                const APInt &UsedBits, bool ForCodeSize) {
-  unsigned NumberOfSlices = LoadedSlices.size();
-  if (StressLoadSlicing)
-    return NumberOfSlices > 1;
-
-  // Check (1).
-  if (NumberOfSlices != 2)
-    return false;
-
-  // Check (2).
-  if (!areUsedBitsDense(UsedBits))
-    return false;
-
-  // Check (3).
-  LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
-  // The original code has one big load.
-  OrigCost.Loads = 1;
-  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
-    const LoadedSlice &LS = LoadedSlices[CurrSlice];
-    // Accumulate the cost of all the slices.
-    LoadedSlice::Cost SliceCost(LS, ForCodeSize);
-    GlobalSlicingCost += SliceCost;
-
-    // Add the gain obtained with the current slice to the cost of the
-    // original configuration.
-    OrigCost.addSliceGain(LS);
-  }
-
-  // If the target supports paired loads, adjust the cost accordingly.
-  adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
-  return OrigCost > GlobalSlicingCost;
-}
-
-/// \brief If the given load, \p N, is used only by trunc or trunc(lshr)
-/// operations, split it into the various pieces being extracted.
-///
-/// This sort of thing is introduced by SROA.
-/// This slicing takes care not to insert overlapping loads.
-/// \pre LI is a simple load (i.e., not an atomic or volatile load).
-bool DAGCombiner::SliceUpLoad(SDNode *N) {
-  if (Level < AfterLegalizeDAG)
-    return false;
-
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-  if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
-      !LD->getValueType(0).isInteger())
-    return false;
-
-  // Keep track of already used bits to detect overlapping values.
-  // In that case, we will just abort the transformation.
-  APInt UsedBits(LD->getValueSizeInBits(0), 0);
-
-  SmallVector<LoadedSlice, 4> LoadedSlices;
-
-  // Check if this load is used as several smaller chunks of bits.
-  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
-  // of computation for each trunc.
-  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
-       UI != UIEnd; ++UI) {
-    // Skip the uses of the chain.
-    if (UI.getUse().getResNo() != 0)
-      continue;
-
-    SDNode *User = *UI;
-    unsigned Shift = 0;
-
-    // Check if this is a trunc(lshr).
-    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
-        isa<ConstantSDNode>(User->getOperand(1))) {
-      Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();
-      User = *User->use_begin();
-    }
-
-    // At this point, User is a truncate iff we encountered trunc or
-    // trunc(lshr).
-    if (User->getOpcode() != ISD::TRUNCATE)
-      return false;
-
-    // The width of the type must be a power of 2 and at least 8 bits.
-    // Otherwise the load cannot be represented in LLVM IR.
-    // Moreover, if we shifted by an amount that is not a multiple of 8 bits,
-    // the slice would straddle byte boundaries. We do not support that.
-    unsigned Width = User->getValueSizeInBits(0);
-    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
-      return false;
-
-    // Build the slice for this chain of computations.
-    LoadedSlice LS(User, LD, Shift, &DAG);
-    APInt CurrentUsedBits = LS.getUsedBits();
-
-    // Check if this slice overlaps with another.
-    if ((CurrentUsedBits & UsedBits) != 0)
-      return false;
-    // Update the bits used globally.
-    UsedBits |= CurrentUsedBits;
-
-    // Check if the new slice would be legal.
-    if (!LS.isLegal())
-      return false;
-
-    // Record the slice.
-    LoadedSlices.push_back(LS);
-  }
-
-  // Abort slicing if it does not seem to be profitable.
-  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
-    return false;
-
-  ++SlicedLoads;
-
-  // Rewrite each chain to use an independent load.
-  // By construction, each chain can be represented by a unique load.
-
-  // Prepare the arguments for the new token factor for all the slices.
-  SmallVector<SDValue, 8> ArgChains;
-  for (SmallVectorImpl<LoadedSlice>::const_iterator
-           LSIt = LoadedSlices.begin(),
-           LSItEnd = LoadedSlices.end();
-       LSIt != LSItEnd; ++LSIt) {
-    SDValue SliceInst = LSIt->loadSlice();
-    CombineTo(LSIt->Inst, SliceInst, true);
-    if (SliceInst.getNode()->getOpcode() != ISD::LOAD)
-      SliceInst = SliceInst.getOperand(0);
-    assert(SliceInst->getOpcode() == ISD::LOAD &&
-           "It takes more than a zext to get to the loaded slice!!");
-    ArgChains.push_back(SliceInst.getValue(1));
-  }
-
-  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
-                              &ArgChains[0], ArgChains.size());
-  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
-  return true;
-}
-
 /// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
 /// load is having specific bytes cleared out.  If so, return the byte size
 /// being masked out and the shift amount.

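Two of the removed pieces are easy to model outside of LLVM. The sketch below
is a standalone approximation (it uses uint64_t masks and a plain struct where
the real code uses APInt and LoadedSlice::Cost, and __builtin_ctzll assumes a
GCC/Clang compiler); it mirrors areUsedBitsDense and the Cost comparison that
isSlicingProfitable relied on:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirrors areUsedBitsDense: true iff the set bits form one contiguous
    // run, i.e. the mask looks like 0..0 1..1 0..0.
    static bool areUsedBitsDenseMock(uint64_t UsedBits) {
      assert(UsedBits != 0 && "no used bits");
      UsedBits >>= __builtin_ctzll(UsedBits);   // drop trailing zeros
      return (UsedBits & (UsedBits + 1)) == 0;  // remainder must be 2^k - 1
    }

    // Mirrors LoadedSlice::Cost::operator<: loads and cross-register-bank
    // copies are treated as the expensive operations unless optimizing for
    // code size.
    struct CostMock {
      bool ForCodeSize = false;
      unsigned Loads = 0, Truncates = 0, CrossRegisterBanksCopies = 0,
               ZExts = 0, Shift = 0;
      bool operator<(const CostMock &RHS) const {
        unsigned ExpensiveLHS = Loads + CrossRegisterBanksCopies;
        unsigned ExpensiveRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
        if (!ForCodeSize && ExpensiveLHS != ExpensiveRHS)
          return ExpensiveLHS < ExpensiveRHS;
        return Truncates + ZExts + Shift + ExpensiveLHS <
               RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveRHS;
      }
    };

    int main() {
      // Like test t2 below: bytes 0-3 and 6-7 used, hole at bytes 4-5,
      // so the used bits are not dense.
      std::printf("dense? %d\n", areUsedBitsDenseMock(0xFFFF0000FFFFFFFFull));
      // One wide load whose slices save a shift and a truncate; assume the
      // pairing adjustment already merged the two sliced loads into one.
      CostMock Orig, Sliced;
      Orig.Loads = 1; Orig.Shift = 1; Orig.Truncates = 1;
      Sliced.Loads = 1;
      std::printf("slicing profitable? %d\n", Sliced < Orig);  // prints 1
    }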
Removed: llvm/trunk/test/CodeGen/X86/load-slice.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/load-slice.ll?rev=192473&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/load-slice.ll (original)
+++ llvm/trunk/test/CodeGen/X86/load-slice.ll (removed)
@@ -1,140 +0,0 @@
-; RUN: llc -mtriple x86_64-apple-macosx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
-; RUN: llc -mtriple x86_64-apple-macosx < %s -o - | FileCheck %s --check-prefix=REGULAR
-;
-; <rdar://problem/14477220>
-
-%class.Complex = type { float, float }
-
-
-; Check that independent slices lead to independent loads and that the slices
-; map to different register files.
-;
-; The layout is:
-; LSB 0 1 2 3 | 4 5 6 7 MSB
-;       Low      High
-; The base address points to 0 and is 8-bytes aligned.
-; Low slice starts at 0 (base) and is 8-bytes aligned.
-; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
-;
-; STRESS-LABEL: t1:
-; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
-; STRESS: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
-; Add high slice: out[out_start].imm, this is base + 4.
-; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
-; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
-; STRESS-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
-; Add low slice: out[out_start].real, this is base + 0.
-; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
-; Swap Imm and Real.
-; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
-; Put the results back into out[out_start].
-; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
-;
-; Same for REGULAR; we eliminate a register bank copy with each slice.
-; REGULAR-LABEL: t1:
-; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
-; REGULAR: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
-; Add high slice: out[out_start].imm, this is base + 4.
-; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
-; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
-; REGULAR-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
-; Add low slice: out[out_start].real, this is base + 0.
-; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
-; Swap Imm and Real.
-; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
-; Put the results back into out[out_start].
-; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
-define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
-entry:
-  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
-  %tmp = bitcast %class.Complex* %arrayidx to i64*
-  %tmp1 = load i64* %tmp, align 8
-  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
-  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
-  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
-  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
-  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
-  %add = add i64 %out_start, 8
-  %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
-  %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
-  %tmp4 = load float* %i.i, align 4
-  %add.i = fadd float %tmp4, %tmp2
-  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
-  %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
-  %tmp5 = load float* %r.i, align 4
-  %add5.i = fadd float %tmp5, %tmp3
-  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
-  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
-  store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
-  ret void
-}
-
-; Function Attrs: nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
-
-; Check that we do not read outside of the chunk of bits of the original loads.
-;
-; The 64-bit load should have been split into one 32-bit and one 16-bit slice.
-; The 16-bit slice should be zero-extended to match the final type.
-;
-; The memory layout is:
-; LSB 0 1 2 3 | 4 5 | 6 7 MSB
-;      Low            High
-; The base address points to 0 and is 8-bytes aligned.
-; Low slice starts at 0 (base) and is 8-bytes aligned.
-; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned.
-;
-; STRESS-LABEL: t2:
-; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
-; STRESS-NEXT: addl ([[BASE]]), %eax
-; STRESS-NEXT: ret
-;
-; For the REGULAR heuristic, it is not profitable to slice things that are not
-; next to each other in memory. Here we have a hole at bytes #4-5.
-; REGULAR-LABEL: t2:
-; REGULAR: shrq $48
-define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
-  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
-  %bitcast = bitcast %class.Complex* %arrayidx to i64*
-  %chunk64 = load i64* %bitcast, align 8
-  %slice32_low = trunc i64 %chunk64 to i32
-  %shift48 = lshr i64 %chunk64, 48
-  %slice32_high = trunc i64 %shift48 to i32
-  %res = add i32 %slice32_high, %slice32_low
-  ret i32 %res
-}
-
-; Check that we do not optimize overlapping slices.
-;
-; The 64-bit load should NOT have been split, as the slices are overlapping.
-; First slice uses bytes numbered 0 to 3.
-; Second slice uses bytes numbered 6 and 7.
-; Third slice uses bytes numbered 4 to 7.
-;
-; STRESS-LABEL: t3:
-; STRESS: shrq $48
-; STRESS: shrq $32
-;
-; REGULAR-LABEL: t3:
-; REGULAR: shrq $48
-; REGULAR: shrq $32
-define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
-  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
-  %bitcast = bitcast %class.Complex* %arrayidx to i64*
-  %chunk64 = load i64* %bitcast, align 8
-  %slice32_low = trunc i64 %chunk64 to i32
-  %shift48 = lshr i64 %chunk64, 48
-  %slice32_high = trunc i64 %shift48 to i32
-  %shift32 = lshr i64 %chunk64, 32
-  %slice32_lowhigh = trunc i64 %shift32 to i32
-  %tmpres = add i32 %slice32_high, %slice32_low
-  %res = add i32 %slice32_lowhigh, %tmpres
-  ret i32 %res
-}
-