[llvm] r278927 - [LoopStrengthReduce] Refactoring and addition of a new target cost function.

Jonas Paulsson via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 17 06:24:19 PDT 2016


Author: jonpa
Date: Wed Aug 17 08:24:19 2016
New Revision: 278927

URL: http://llvm.org/viewvc/llvm-project?rev=278927&view=rev
Log:
[LoopStrengthReduce] Refactoring and addition of a new target cost function.

Refactored so that an LSRUse owns its fixups, as opposed to letting the
LSRInstance own them. This makes it easier to rate formulas for
LSRUses, since the fixups are available directly. The Offsets vector
has been removed since it is no longer necessary.
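
In outline, each LSRUse now carries its own fixup list and keeps its
offset bounds in sync as fixups are added (condensed from the
LoopStrengthReduce.cpp hunk below):

    // The relevant new members of LSRUse:
    SmallVector<LSRFixup, 8> Fixups;
    int64_t MinOffset;
    int64_t MaxOffset;

    LSRFixup &getNewFixup() {
      Fixups.push_back(LSRFixup());
      return Fixups.back();
    }

    void pushFixup(LSRFixup &f) {
      Fixups.push_back(f);
      if (f.Offset > MaxOffset)
        MaxOffset = f.Offset;
      if (f.Offset < MinOffset)
        MinOffset = f.Offset;
    }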

Added a new target hook, isFoldableMemAccessOffset(), which is queried
during formula rating.
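
During rating, Cost::RateFormula now walks the fixups of the LSRUse
directly and asks the target whether each offset folds into the user's
load or store, charging an extra base add when it does not (condensed
from the LoopStrengthReduce.cpp hunk below):

    for (const LSRFixup &Fixup : LU.Fixups) {
      int64_t Offset = (uint64_t)Fixup.Offset + F.BaseOffset;
      // Check with target if this offset with this instruction is
      // specifically not supported.
      if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
          !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
        NumBaseAdds++;
    }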

For SystemZ, this makes it possible to express that loads and stores of
float or vector type with a big or negative offset should be avoided in
loops. Without this, LSR would generate a lot of negative offsets that
require extra instructions to compute the address.
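
Concretely, the SystemZ override (condensed from the
SystemZISelLowering.cpp hunk below) only rejects an offset when vector
support is available (z13) and a float or vector access would need more
than a 12-bit unsigned displacement:

    bool SystemZTargetLowering::isFoldableMemAccessOffset(
        Instruction *I, int64_t Offset) const {
      if (!Subtarget.hasVector())
        return true; // This only applies to z13.
      // LDE and the vector load / store instructions only support
      // small (12-bit unsigned) displacements.
      Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType()
                                            : I->getOperand(0)->getType());
      if (!isUInt<12>(Offset) &&
          (MemAccessTy->isFloatingPointTy() || MemAccessTy->isVectorTy()))
        return false;
      return true;
    }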

Updated tests:
test/CodeGen/SystemZ/loop-01.ll

Reviewed by: Quentin Colombet and Ulrich Weigand.
https://reviews.llvm.org/D19152

Modified:
    llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
    llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/trunk/include/llvm/Target/TargetLowering.h
    llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
    llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.cpp
    llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.h
    llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
    llvm/trunk/test/CodeGen/SystemZ/loop-01.ll

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Wed Aug 17 08:24:19 2016
@@ -351,6 +351,12 @@ public:
                            bool HasBaseReg, int64_t Scale,
                            unsigned AddrSpace = 0) const;
 
+  /// \brief Return true if the target supports the load / store
+  /// instruction with the given Offset of the form reg + Offset.
+  /// The Offset may be too big for a certain type (register
+  /// class).
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const;
+  
   /// \brief Return true if it's free to truncate a value of type Ty1 to type
   /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
@@ -660,6 +666,7 @@ public:
   virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                    int64_t BaseOffset, bool HasBaseReg,
                                    int64_t Scale, unsigned AddrSpace) = 0;
+  virtual bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) = 0;
   virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
   virtual bool isProfitableToHoist(Instruction *I) = 0;
   virtual bool isTypeLegal(Type *Ty) = 0;
@@ -822,6 +829,9 @@ public:
     return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
                                      Scale, AddrSpace);
   }
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) override {
+    return Impl.isFoldableMemAccessOffset(I, Offset);
+  }
   bool isTruncateFree(Type *Ty1, Type *Ty2) override {
     return Impl.isTruncateFree(Ty1, Ty2);
   }

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h Wed Aug 17 08:24:19 2016
@@ -235,6 +235,8 @@ public:
     return -1;
   }
 
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) { return true; }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) { return false; }
 
   bool isProfitableToHoist(Instruction *I) { return true; }

Modified: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h (original)
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h Wed Aug 17 08:24:19 2016
@@ -145,6 +145,10 @@ public:
     return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
   }
 
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) {
+    return getTLI()->isFoldableMemAccessOffset(I, Offset);
+  }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) {
     return getTLI()->isTruncateFree(Ty1, Ty2);
   }

Modified: llvm/trunk/include/llvm/Target/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetLowering.h?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/Target/TargetLowering.h Wed Aug 17 08:24:19 2016
@@ -1630,6 +1630,10 @@ public:
     return -1;
   }
 
+  virtual bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const {
+    return true;
+  }
+
   /// Return true if the specified immediate is legal icmp immediate, that is
   /// the target has icmp instructions which can compare a register against the
   /// immediate without having to materialize the immediate into a register.

Modified: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/TargetTransformInfo.cpp?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp Wed Aug 17 08:24:19 2016
@@ -150,6 +150,11 @@ int TargetTransformInfo::getScalingFacto
   return Cost;
 }
 
+bool TargetTransformInfo::isFoldableMemAccessOffset(Instruction *I,
+                                                    int64_t Offset) const {
+  return TTIImpl->isFoldableMemAccessOffset(I, Offset);
+}
+
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
   return TTIImpl->isTruncateFree(Ty1, Ty2);
 }

Modified: llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.cpp?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.cpp Wed Aug 17 08:24:19 2016
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/IR/Intrinsics.h"
 #include <cctype>
 
@@ -531,6 +532,28 @@ bool SystemZTargetLowering::isLegalAddre
   return AM.Scale == 0 || AM.Scale == 1;
 }
 
+bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I,
+                                                      int64_t Offset) const {
+  // This only applies to z13.
+  if (!Subtarget.hasVector())
+    return true;
+
+  // * Use LDE instead of LE/LEY to avoid partial register
+  //   dependencies (LDE only supports small offsets).
+  // * Utilize the vector registers to hold floating point
+  //   values (vector load / store instructions only support small
+  //   offsets).
+
+  assert (isa<LoadInst>(I) || isa<StoreInst>(I));
+  Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
+                       I->getOperand(0)->getType());
+  if (!isUInt<12>(Offset) &&
+      (MemAccessTy->isFloatingPointTy() || MemAccessTy->isVectorTy()))
+    return false;
+
+  return true;
+}
+
 bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
   if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
     return false;

Modified: llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.h?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.h (original)
+++ llvm/trunk/lib/Target/SystemZ/SystemZISelLowering.h Wed Aug 17 08:24:19 2016
@@ -394,6 +394,7 @@ public:
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS) const override;
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const override;
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *Fast) const override;

Modified: llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp Wed Aug 17 08:24:19 2016
@@ -884,7 +884,6 @@ public:
                    SmallPtrSetImpl<const SCEV *> &Regs,
                    const DenseSet<const SCEV *> &VisitedRegs,
                    const Loop *L,
-                   const SmallVectorImpl<int64_t> &Offsets,
                    ScalarEvolution &SE, DominatorTree &DT,
                    const LSRUse &LU,
                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
@@ -903,6 +902,143 @@ private:
                            ScalarEvolution &SE, DominatorTree &DT,
                            SmallPtrSetImpl<const SCEV *> *LoserRegs);
 };
+  
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+  /// The instruction which will be updated.
+  Instruction *UserInst;
+
+  /// The operand of the instruction which will be replaced. The operand may be
+  /// used more than once; every instance will be replaced.
+  Value *OperandValToReplace;
+
+  /// If this user is to use the post-incremented value of an induction
+  /// variable, this variable is non-null and holds the loop associated with the
+  /// induction variable.
+  PostIncLoopSet PostIncLoops;
+
+  /// A constant offset to be added to the LSRUse expression.  This allows
+  /// multiple fixups to share the same LSRUse with different offsets, for
+  /// example in an unrolled loop.
+  int64_t Offset;
+
+  bool isUseFullyOutsideLoop(const Loop *L) const;
+
+  LSRFixup();
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
+  static SmallVector<const SCEV *, 4> getEmptyKey() {
+    SmallVector<const SCEV *, 4>  V;
+    V.push_back(reinterpret_cast<const SCEV *>(-1));
+    return V;
+  }
+
+  static SmallVector<const SCEV *, 4> getTombstoneKey() {
+    SmallVector<const SCEV *, 4> V;
+    V.push_back(reinterpret_cast<const SCEV *>(-2));
+    return V;
+  }
+
+  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+  }
+
+  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+                      const SmallVector<const SCEV *, 4> &RHS) {
+    return LHS == RHS;
+  }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied.  TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+  /// An enum for a kind of use, indicating what types of scaled and immediate
+  /// operands it might support.
+  enum KindType {
+    Basic,   ///< A normal use, with no folding.
+    Special, ///< A special case of basic, allowing -1 scales.
+    Address, ///< An address use; folding according to TargetLowering
+    ICmpZero ///< An equality icmp with both operands folded into one.
+    // TODO: Add a generic icmp too?
+  };
+
+  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
+
+  KindType Kind;
+  MemAccessTy AccessTy;
+
+  /// The list of operands which are to be replaced.
+  SmallVector<LSRFixup, 8> Fixups;
+
+  /// Keep track of the min and max offsets of the fixups.
+  int64_t MinOffset;
+  int64_t MaxOffset;
+
+  /// This records whether all of the fixups using this LSRUse are outside of
+  /// the loop, in which case some special-case heuristics may be used.
+  bool AllFixupsOutsideLoop;
+
+  /// RigidFormula is set to true to guarantee that this use will be associated
+  /// with a single formula--the one that initially matched. Some SCEV
+  /// expressions cannot be expanded. This allows LSR to consider the registers
+  /// used by those expressions without the need to expand them later after
+  /// changing the formula.
+  bool RigidFormula;
+
+  /// This records the widest use type for any fixup using this
+  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+  /// fixup widths to be equivalent, because the narrower one may be relying on
+  /// the implicit truncation to truncate away bogus bits.
+  Type *WidestFixupType;
+
+  /// A list of ways to build a value that can satisfy this user.  After the
+  /// list is populated, one of these is selected heuristically and used to
+  /// formulate a replacement for OperandValToReplace in UserInst.
+  SmallVector<Formula, 12> Formulae;
+
+  /// The set of register candidates used by all formulae in this LSRUse.
+  SmallPtrSet<const SCEV *, 4> Regs;
+
+  LSRUse(KindType K, MemAccessTy AT)
+      : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
+        AllFixupsOutsideLoop(true), RigidFormula(false),
+        WidestFixupType(nullptr) {}
+
+  LSRFixup &getNewFixup() {
+    Fixups.push_back(LSRFixup());
+    return Fixups.back();
+  }
+
+  void pushFixup(LSRFixup &f) {
+    Fixups.push_back(f);
+    if (f.Offset > MaxOffset)
+      MaxOffset = f.Offset;
+    if (f.Offset < MinOffset)
+      MinOffset = f.Offset;
+  }
+  
+  bool HasFormulaWithSameRegs(const Formula &F) const;
+  bool InsertFormula(const Formula &F);
+  void DeleteFormula(Formula &F);
+  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
 
 }
 
@@ -976,7 +1112,6 @@ void Cost::RateFormula(const TargetTrans
                        SmallPtrSetImpl<const SCEV *> &Regs,
                        const DenseSet<const SCEV *> &VisitedRegs,
                        const Loop *L,
-                       const SmallVectorImpl<int64_t> &Offsets,
                        ScalarEvolution &SE, DominatorTree &DT,
                        const LSRUse &LU,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
@@ -1014,13 +1149,20 @@ void Cost::RateFormula(const TargetTrans
   ScaleCost += getScalingFactorCost(TTI, LU, F);
 
   // Tally up the non-zero immediates.
-  for (int64_t O : Offsets) {
+  for (const LSRFixup &Fixup : LU.Fixups) {
+    int64_t O = Fixup.Offset;
     int64_t Offset = (uint64_t)O + F.BaseOffset;
     if (F.BaseGV)
       ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
     else if (Offset != 0)
       ImmCost += APInt(64, Offset, true).getMinSignedBits();
+
+    // Check with target if this offset with this instruction is
+    // specifically not supported.
+    if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
+        !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
+      NumBaseAdds++;
   }
   assert(isValid() && "invalid cost");
 }
@@ -1067,44 +1209,8 @@ void Cost::dump() const {
   print(errs()); errs() << '\n';
 }
 
-namespace {
-
-/// An operand value in an instruction which is to be replaced with some
-/// equivalent, possibly strength-reduced, replacement.
-struct LSRFixup {
-  /// The instruction which will be updated.
-  Instruction *UserInst;
-
-  /// The operand of the instruction which will be replaced. The operand may be
-  /// used more than once; every instance will be replaced.
-  Value *OperandValToReplace;
-
-  /// If this user is to use the post-incremented value of an induction
-  /// variable, this variable is non-null and holds the loop associated with the
-  /// induction variable.
-  PostIncLoopSet PostIncLoops;
-
-  /// The index of the LSRUse describing the expression which this fixup needs,
-  /// minus an offset (below).
-  size_t LUIdx;
-
-  /// A constant offset to be added to the LSRUse expression.  This allows
-  /// multiple fixups to share the same LSRUse with different offsets, for
-  /// example in an unrolled loop.
-  int64_t Offset;
-
-  bool isUseFullyOutsideLoop(const Loop *L) const;
-
-  LSRFixup();
-
-  void print(raw_ostream &OS) const;
-  void dump() const;
-};
-
-}
-
 LSRFixup::LSRFixup()
-  : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
+  : UserInst(nullptr), OperandValToReplace(nullptr),
     Offset(0) {}
 
 /// Test whether this fixup always uses its value outside of the given loop.
@@ -1140,9 +1246,6 @@ void LSRFixup::print(raw_ostream &OS) co
     PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
   }
 
-  if (LUIdx != ~size_t(0))
-    OS << ", LUIdx=" << LUIdx;
-
   if (Offset != 0)
     OS << ", Offset=" << Offset;
 }
@@ -1152,102 +1255,6 @@ void LSRFixup::dump() const {
   print(errs()); errs() << '\n';
 }
 
-namespace {
-
-/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
-/// SmallVectors of const SCEV*.
-struct UniquifierDenseMapInfo {
-  static SmallVector<const SCEV *, 4> getEmptyKey() {
-    SmallVector<const SCEV *, 4>  V;
-    V.push_back(reinterpret_cast<const SCEV *>(-1));
-    return V;
-  }
-
-  static SmallVector<const SCEV *, 4> getTombstoneKey() {
-    SmallVector<const SCEV *, 4> V;
-    V.push_back(reinterpret_cast<const SCEV *>(-2));
-    return V;
-  }
-
-  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
-    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
-  }
-
-  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
-                      const SmallVector<const SCEV *, 4> &RHS) {
-    return LHS == RHS;
-  }
-};
-
-/// This class holds the state that LSR keeps for each use in IVUsers, as well
-/// as uses invented by LSR itself. It includes information about what kinds of
-/// things can be folded into the user, information about the user itself, and
-/// information about how the use may be satisfied.  TODO: Represent multiple
-/// users of the same expression in common?
-class LSRUse {
-  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
-
-public:
-  /// An enum for a kind of use, indicating what types of scaled and immediate
-  /// operands it might support.
-  enum KindType {
-    Basic,   ///< A normal use, with no folding.
-    Special, ///< A special case of basic, allowing -1 scales.
-    Address, ///< An address use; folding according to TargetLowering
-    ICmpZero ///< An equality icmp with both operands folded into one.
-    // TODO: Add a generic icmp too?
-  };
-
-  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
-
-  KindType Kind;
-  MemAccessTy AccessTy;
-
-  SmallVector<int64_t, 8> Offsets;
-  int64_t MinOffset;
-  int64_t MaxOffset;
-
-  /// This records whether all of the fixups using this LSRUse are outside of
-  /// the loop, in which case some special-case heuristics may be used.
-  bool AllFixupsOutsideLoop;
-
-  /// RigidFormula is set to true to guarantee that this use will be associated
-  /// with a single formula--the one that initially matched. Some SCEV
-  /// expressions cannot be expanded. This allows LSR to consider the registers
-  /// used by those expressions without the need to expand them later after
-  /// changing the formula.
-  bool RigidFormula;
-
-  /// This records the widest use type for any fixup using this
-  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
-  /// fixup widths to be equivalent, because the narrower one may be relying on
-  /// the implicit truncation to truncate away bogus bits.
-  Type *WidestFixupType;
-
-  /// A list of ways to build a value that can satisfy this user.  After the
-  /// list is populated, one of these is selected heuristically and used to
-  /// formulate a replacement for OperandValToReplace in UserInst.
-  SmallVector<Formula, 12> Formulae;
-
-  /// The set of register candidates used by all formulae in this LSRUse.
-  SmallPtrSet<const SCEV *, 4> Regs;
-
-  LSRUse(KindType K, MemAccessTy AT)
-      : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
-        AllFixupsOutsideLoop(true), RigidFormula(false),
-        WidestFixupType(nullptr) {}
-
-  bool HasFormulaWithSameRegs(const Formula &F) const;
-  bool InsertFormula(const Formula &F);
-  void DeleteFormula(Formula &F);
-  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
-
-  void print(raw_ostream &OS) const;
-  void dump() const;
-};
-
-}
-
 /// Test whether this use as a formula which has the same registers as the given
 /// formula.
 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
@@ -1335,9 +1342,9 @@ void LSRUse::print(raw_ostream &OS) cons
 
   OS << ", Offsets={";
   bool NeedComma = false;
-  for (int64_t O : Offsets) {
+  for (const LSRFixup &Fixup : Fixups) {
     if (NeedComma) OS << ',';
-    OS << O;
+    OS << Fixup.Offset;
     NeedComma = true;
   }
   OS << '}';
@@ -1644,9 +1651,6 @@ class LSRInstance {
   /// Interesting use types, to facilitate truncation reuse.
   SmallSetVector<Type *, 4> Types;
 
-  /// The list of operands which are to be replaced.
-  SmallVector<LSRFixup, 16> Fixups;
-
   /// The list of interesting uses.
   SmallVector<LSRUse, 16> Uses;
 
@@ -1679,11 +1683,6 @@ class LSRInstance {
   void CollectInterestingTypesAndFactors();
   void CollectFixupsAndInitialFormulae();
 
-  LSRFixup &getNewFixup() {
-    Fixups.push_back(LSRFixup());
-    return Fixups.back();
-  }
-
   // Support for sharing of LSRUses between LSRFixups.
   typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
   UseMapTy UseMap;
@@ -1753,16 +1752,16 @@ class LSRInstance {
                                   const LSRUse &LU,
                                   SCEVExpander &Rewriter) const;
 
-  Value *Expand(const LSRFixup &LF,
+  Value *Expand(const LSRUse &LU, const LSRFixup &LF,
                 const Formula &F,
                 BasicBlock::iterator IP,
                 SCEVExpander &Rewriter,
                 SmallVectorImpl<WeakVH> &DeadInsts) const;
-  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
+  void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
                      const Formula &F,
                      SCEVExpander &Rewriter,
                      SmallVectorImpl<WeakVH> &DeadInsts) const;
-  void Rewrite(const LSRFixup &LF,
+  void Rewrite(const LSRUse &LU, const LSRFixup &LF,
                const Formula &F,
                SCEVExpander &Rewriter,
                SmallVectorImpl<WeakVH> &DeadInsts) const;
@@ -2262,8 +2261,6 @@ bool LSRInstance::reconcileNewOffset(LSR
   LU.MinOffset = NewMinOffset;
   LU.MaxOffset = NewMaxOffset;
   LU.AccessTy = NewAccessTy;
-  if (NewOffset != LU.Offsets.back())
-    LU.Offsets.push_back(NewOffset);
   return true;
 }
 
@@ -2300,11 +2297,6 @@ std::pair<size_t, int64_t> LSRInstance::
   Uses.push_back(LSRUse(Kind, AccessTy));
   LSRUse &LU = Uses[LUIdx];
 
-  // We don't need to track redundant offsets, but we don't need to go out
-  // of our way here to avoid them.
-  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
-    LU.Offsets.push_back(Offset);
-
   LU.MinOffset = Offset;
   LU.MaxOffset = Offset;
   return std::make_pair(LUIdx, Offset);
@@ -2958,33 +2950,28 @@ void LSRInstance::CollectFixupsAndInitia
     if (IVIncSet.count(UseI))
       continue;
 
-    // Record the uses.
-    LSRFixup &LF = getNewFixup();
-    LF.UserInst = UserInst;
-    LF.OperandValToReplace = U.getOperandValToReplace();
-    LF.PostIncLoops = U.getPostIncLoops();
-
     LSRUse::KindType Kind = LSRUse::Basic;
     MemAccessTy AccessTy;
-    if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
+    if (isAddressUse(UserInst, U.getOperandValToReplace())) {
       Kind = LSRUse::Address;
-      AccessTy = getAccessType(LF.UserInst);
+      AccessTy = getAccessType(UserInst);
     }
 
     const SCEV *S = IU.getExpr(U);
-
+    PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
+    
     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
     // (N - i == 0), and this allows (N - i) to be the expression that we work
     // with rather than just N or i, so we can consider the register
     // requirements for both N and i at the same time. Limiting this code to
     // equality icmps is not a problem because all interesting loops use
     // equality icmps, thanks to IndVarSimplify.
-    if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
+    if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
       if (CI->isEquality()) {
         // Swap the operands if needed to put the OperandValToReplace on the
         // left, for consistency.
         Value *NV = CI->getOperand(1);
-        if (NV == LF.OperandValToReplace) {
+        if (NV == U.getOperandValToReplace()) {
           CI->setOperand(1, CI->getOperand(0));
           CI->setOperand(0, NV);
           NV = CI->getOperand(1);
@@ -2997,7 +2984,7 @@ void LSRInstance::CollectFixupsAndInitia
           // S is normalized, so normalize N before folding it into S
           // to keep the result normalized.
           N = TransformForPostIncUse(Normalize, N, CI, nullptr,
-                                     LF.PostIncLoops, SE, DT);
+                                     TmpPostIncLoops, SE, DT);
           Kind = LSRUse::ICmpZero;
           S = SE.getMinusSCEV(N, S);
         }
@@ -3010,12 +2997,20 @@ void LSRInstance::CollectFixupsAndInitia
         Factors.insert(-1);
       }
 
-    // Set up the initial formula for this use.
+    // Get or create an LSRUse.
     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
-    LF.LUIdx = P.first;
-    LF.Offset = P.second;
-    LSRUse &LU = Uses[LF.LUIdx];
+    size_t LUIdx = P.first;
+    int64_t Offset = P.second;
+    LSRUse &LU = Uses[LUIdx];
+
+    // Record the fixup.
+    LSRFixup &LF = LU.getNewFixup();
+    LF.UserInst = UserInst;
+    LF.OperandValToReplace = U.getOperandValToReplace();
+    LF.PostIncLoops = TmpPostIncLoops;
+    LF.Offset = Offset;
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+
     if (!LU.WidestFixupType ||
         SE.getTypeSizeInBits(LU.WidestFixupType) <
         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -3023,8 +3018,8 @@ void LSRInstance::CollectFixupsAndInitia
 
     // If this is the first use of this LSRUse, give it a formula.
     if (LU.Formulae.empty()) {
-      InsertInitialFormula(S, LU, LF.LUIdx);
-      CountRegisters(LU.Formulae.back(), LF.LUIdx);
+      InsertInitialFormula(S, LU, LUIdx);
+      CountRegisters(LU.Formulae.back(), LUIdx);
     }
   }
 
@@ -3150,20 +3145,21 @@ LSRInstance::CollectLoopInvariantFixupsA
             continue;
         }
 
-        LSRFixup &LF = getNewFixup();
-        LF.UserInst = const_cast<Instruction *>(UserInst);
-        LF.OperandValToReplace = U;
         std::pair<size_t, int64_t> P = getUse(
             S, LSRUse::Basic, MemAccessTy());
-        LF.LUIdx = P.first;
-        LF.Offset = P.second;
-        LSRUse &LU = Uses[LF.LUIdx];
+        size_t LUIdx = P.first;
+        int64_t Offset = P.second;
+        LSRUse &LU = Uses[LUIdx];
+        LSRFixup &LF = LU.getNewFixup();
+        LF.UserInst = const_cast<Instruction *>(UserInst);
+        LF.OperandValToReplace = U;
+        LF.Offset = Offset;
         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
         if (!LU.WidestFixupType ||
             SE.getTypeSizeInBits(LU.WidestFixupType) <
             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
           LU.WidestFixupType = LF.OperandValToReplace->getType();
-        InsertSupplementalFormula(US, LU, LF.LUIdx);
+        InsertSupplementalFormula(US, LU, LUIdx);
         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
         break;
       }
@@ -3892,8 +3888,7 @@ void LSRInstance::FilterOutUndesirableDe
       // the corresponding bad register from the Regs set.
       Cost CostF;
       Regs.clear();
-      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
-                        &LoserRegs);
+      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
       if (CostF.isLoser()) {
         // During initial formula generation, undesirable formulae are generated
         // by uses within other loops that have some non-trivial address mode or
@@ -3926,8 +3921,7 @@ void LSRInstance::FilterOutUndesirableDe
 
         Cost CostBest;
         Regs.clear();
-        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
-                             DT, LU);
+        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
         if (CostF < CostBest)
           std::swap(F, Best);
         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
@@ -4073,25 +4067,13 @@ void LSRInstance::NarrowSearchSpaceByCol
 
       LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
 
-      // Update the relocs to reference the new use.
-      for (LSRFixup &Fixup : Fixups) {
-        if (Fixup.LUIdx == LUIdx) {
-          Fixup.LUIdx = LUThatHas - &Uses.front();
-          Fixup.Offset += F.BaseOffset;
-          // Add the new offset to LUThatHas' offset list.
-          if (LUThatHas->Offsets.back() != Fixup.Offset) {
-            LUThatHas->Offsets.push_back(Fixup.Offset);
-            if (Fixup.Offset > LUThatHas->MaxOffset)
-              LUThatHas->MaxOffset = Fixup.Offset;
-            if (Fixup.Offset < LUThatHas->MinOffset)
-              LUThatHas->MinOffset = Fixup.Offset;
-          }
-          DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
-        }
-        if (Fixup.LUIdx == NumUses-1)
-          Fixup.LUIdx = LUIdx;
+      // Transfer the fixups of LU to LUThatHas.
+      for (LSRFixup &Fixup : LU.Fixups) {
+        Fixup.Offset += F.BaseOffset;
+        LUThatHas->pushFixup(Fixup);
+        DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
       }
-
+      
       // Delete formulae from the new use which are no longer legal.
       bool Any = false;
       for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
@@ -4265,8 +4247,7 @@ void LSRInstance::SolveRecurse(SmallVect
     // the current best, prune the search at that point.
     NewCost = CurCost;
     NewRegs = CurRegs;
-    NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
-                        LU);
+    NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
     if (NewCost < SolutionCost) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
@@ -4449,12 +4430,12 @@ LSRInstance::AdjustInsertPositionForExpa
 
 /// Emit instructions for the leading candidate expression for this LSRUse (this
 /// is called "expanding").
-Value *LSRInstance::Expand(const LSRFixup &LF,
+Value *LSRInstance::Expand(const LSRUse &LU,
+                           const LSRFixup &LF,
                            const Formula &F,
                            BasicBlock::iterator IP,
                            SCEVExpander &Rewriter,
                            SmallVectorImpl<WeakVH> &DeadInsts) const {
-  const LSRUse &LU = Uses[LF.LUIdx];
   if (LU.RigidFormula)
     return LF.OperandValToReplace;
 
@@ -4636,6 +4617,7 @@ Value *LSRInstance::Expand(const LSRFixu
 /// effectively happens in their predecessor blocks, so the expression may need
 /// to be expanded in multiple places.
 void LSRInstance::RewriteForPHI(PHINode *PN,
+                                const LSRUse &LU,
                                 const LSRFixup &LF,
                                 const Formula &F,
                                 SCEVExpander &Rewriter,
@@ -4689,7 +4671,7 @@ void LSRInstance::RewriteForPHI(PHINode
       if (!Pair.second)
         PN->setIncomingValue(i, Pair.first->second);
       else {
-        Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(),
+        Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
                               Rewriter, DeadInsts);
 
         // If this is reuse-by-noop-cast, insert the noop cast.
@@ -4710,17 +4692,18 @@ void LSRInstance::RewriteForPHI(PHINode
 /// Emit instructions for the leading candidate expression for this LSRUse (this
 /// is called "expanding"), and update the UserInst to reference the newly
 /// expanded value.
-void LSRInstance::Rewrite(const LSRFixup &LF,
+void LSRInstance::Rewrite(const LSRUse &LU,
+                          const LSRFixup &LF,
                           const Formula &F,
                           SCEVExpander &Rewriter,
                           SmallVectorImpl<WeakVH> &DeadInsts) const {
   // First, find an insertion point that dominates UserInst. For PHI nodes,
   // find the nearest block which dominates all the relevant uses.
   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
-    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts);
+    RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
   } else {
     Value *FullV =
-        Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+      Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
 
     // If this is reuse-by-noop-cast, insert the noop cast.
     Type *OpTy = LF.OperandValToReplace->getType();
@@ -4736,7 +4719,7 @@ void LSRInstance::Rewrite(const LSRFixup
     // its new value may happen to be equal to LF.OperandValToReplace, in
     // which case doing replaceUsesOfWith leads to replacing both operands
     // with the same value. TODO: Reorganize this.
-    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
+    if (LU.Kind == LSRUse::ICmpZero)
       LF.UserInst->setOperand(0, FullV);
     else
       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
@@ -4769,11 +4752,11 @@ void LSRInstance::ImplementSolution(
   }
 
   // Expand the new value definitions and update the users.
-  for (const LSRFixup &Fixup : Fixups) {
-    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts);
-
-    Changed = true;
-  }
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+    for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
+      Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
+      Changed = true;
+    }
 
   for (const IVChain &Chain : IVChainVec) {
     GenerateIVChain(Chain, Rewriter, DeadInsts);
@@ -4917,11 +4900,12 @@ void LSRInstance::print_factors_and_type
 
 void LSRInstance::print_fixups(raw_ostream &OS) const {
   OS << "LSR is examining the following fixup sites:\n";
-  for (const LSRFixup &LF : Fixups) {
-    dbgs() << "  ";
-    LF.print(OS);
-    OS << '\n';
-  }
+  for (const LSRUse &LU : Uses)
+    for (const LSRFixup &LF : LU.Fixups) {
+      dbgs() << "  ";
+      LF.print(OS);
+      OS << '\n';
+    }
 }
 
 void LSRInstance::print_uses(raw_ostream &OS) const {

Modified: llvm/trunk/test/CodeGen/SystemZ/loop-01.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/SystemZ/loop-01.ll?rev=278927&r1=278926&r2=278927&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/SystemZ/loop-01.ll (original)
+++ llvm/trunk/test/CodeGen/SystemZ/loop-01.ll Wed Aug 17 08:24:19 2016
@@ -1,6 +1,8 @@
 ; Test loop tuning.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+; RUN:  | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-Z13
 
 ; Test that strength reduction is applied to addresses with a scale factor,
 ; but that indexed addressing can still be used.
@@ -122,3 +124,118 @@ loop.next:
 exit:
   ret void
 }
+
+; Test that negative offsets are avoided for loads of floating point.
+%s.float = type { float, float, float }
+define void @f5(%s.float* nocapture %a,
+                %s.float* nocapture readonly %b,
+                i32 zeroext %S) {
+; CHECK-Z13-LABEL: f5:
+; CHECK-Z13-NOT: -{{[0-9]+}}(%r
+
+entry:
+  %cmp9 = icmp eq i32 %S, 0
+  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                 ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:          ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                   ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                           ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a1 = getelementptr inbounds %s.float, %s.float* %b, i64 %indvars.iv, i32 0
+  %tmp = load float, float* %a1, align 4
+  %b4 = getelementptr inbounds %s.float, %s.float* %b, i64 %indvars.iv, i32 1
+  %tmp1 = load float, float* %b4, align 4
+  %add = fadd float %tmp, %tmp1
+  %c = getelementptr inbounds %s.float, %s.float* %b, i64 %indvars.iv, i32 2
+  %tmp2 = load float, float* %c, align 4
+  %add7 = fadd float %add, %tmp2
+  %a10 = getelementptr inbounds %s.float, %s.float* %a, i64 %indvars.iv, i32 0
+  store float %add7, float* %a10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %S
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Test that negative offsets are avoided for loads of double.
+%s.double = type { double, double, double }
+define void @f6(%s.double* nocapture %a,
+                %s.double* nocapture readonly %b,
+                i32 zeroext %S) {
+; CHECK-Z13-LABEL: f6:
+; CHECK-Z13-NOT: -{{[0-9]+}}(%r
+entry:
+  %cmp9 = icmp eq i32 %S, 0
+  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                  ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:           ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                    ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                            ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a1 = getelementptr inbounds %s.double, %s.double* %b, i64 %indvars.iv, i32 0
+  %tmp = load double, double* %a1, align 4
+  %b4 = getelementptr inbounds %s.double, %s.double* %b, i64 %indvars.iv, i32 1
+  %tmp1 = load double, double* %b4, align 4
+  %add = fadd double %tmp, %tmp1
+  %c = getelementptr inbounds %s.double, %s.double* %b, i64 %indvars.iv, i32 2
+  %tmp2 = load double, double* %c, align 4
+  %add7 = fadd double %add, %tmp2
+  %a10 = getelementptr inbounds %s.double, %s.double* %a, i64 %indvars.iv, i32 0
+  store double %add7, double* %a10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %S
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Test that negative offsets are avoided for memory accesses of vector type.
+%s.vec = type { <4 x i32>, <4 x i32>, <4 x i32> }
+define void @f7(%s.vec* nocapture %a,
+                %s.vec* nocapture readonly %b,
+                i32 zeroext %S) {
+; CHECK-Z13-LABEL: f7:
+; CHECK-Z13-NOT: -{{[0-9]+}}(%r
+entry:
+  %cmp9 = icmp eq i32 %S, 0
+  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                 ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:          ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                   ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                           ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a1 = getelementptr inbounds %s.vec, %s.vec* %b, i64 %indvars.iv, i32 0
+  %tmp = load <4 x i32>, <4 x i32>* %a1, align 4
+  %b4 = getelementptr inbounds %s.vec, %s.vec* %b, i64 %indvars.iv, i32 1
+  %tmp1 = load <4 x i32>, <4 x i32>* %b4, align 4
+  %add = add <4 x i32> %tmp1, %tmp
+  %c = getelementptr inbounds %s.vec, %s.vec* %b, i64 %indvars.iv, i32 2
+  %tmp2 = load <4 x i32>, <4 x i32>* %c, align 4
+  %add7 = add <4 x i32> %add, %tmp2
+  %a10 = getelementptr inbounds %s.vec, %s.vec* %a, i64 %indvars.iv, i32 0
+  store <4 x i32> %add7, <4 x i32>* %a10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %S
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
