[llvm-commits] [llvm] r65108 - in /llvm/trunk: lib/Transforms/Scalar/LoopStrengthReduce.cpp test/CodeGen/X86/full-lsr.ll test/CodeGen/X86/iv-users-in-other-loops.ll test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll

Dan Gohman gohman at apple.com
Thu Feb 19 20:17:50 PST 2009


Author: djg
Date: Thu Feb 19 22:17:46 2009
New Revision: 65108

URL: http://llvm.org/viewvc/llvm-project?rev=65108&view=rev
Log:
Implement "superhero" strength reduction, or full strength
reduction of address calculations down to basic pointer arithmetic.
This is currently off by default, as it needs a few other features
before it becomes generally useful. And even when enabled, full
strength reduction is only performed when it doesn't increase
register pressure, and when several other conditions are true.

This also factors a bunch of existing LSR code out of
StrengthReduceStridedIVUsers into separate functions, and tidies
up IV insertion. This actually decreases register pressure even
in non-superhero mode. The change in iv-users-in-other-loops.ll
is an example of this; there are two more adds because there are
two fewer leas, and there is less spilling.

Added:
    llvm/trunk/test/CodeGen/X86/full-lsr.ll
Modified:
    llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
    llvm/trunk/test/CodeGen/X86/iv-users-in-other-loops.ll
    llvm/trunk/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll

Modified: llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp?rev=65108&r1=65107&r2=65108&view=diff

==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp Thu Feb 19 22:17:46 2009
@@ -35,6 +35,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
 #include <algorithm>
 #include <set>
@@ -46,6 +47,10 @@
 STATISTIC(NumEliminated,  "Number of strides eliminated");
 STATISTIC(NumShadow,      "Number of Shadow IVs optimized");
 
+static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
+                                       cl::init(false),
+                                       cl::Hidden);
+
 namespace {
 
   struct BasedUser;
@@ -208,6 +213,29 @@
                               bool &AllUsesAreAddresses,
                               bool &AllUsesAreOutsideLoop,
                               std::vector<BasedUser> &UsersToProcess);
+    bool ShouldUseFullStrengthReductionMode(
+                                const std::vector<BasedUser> &UsersToProcess,
+                                const Loop *L,
+                                bool AllUsesAreAddresses,
+                                SCEVHandle Stride);
+    void PrepareToStrengthReduceFully(
+                             std::vector<BasedUser> &UsersToProcess,
+                             SCEVHandle Stride,
+                             SCEVHandle CommonExprs,
+                             const Loop *L,
+                             SCEVExpander &PreheaderRewriter);
+    void PrepareToStrengthReduceFromSmallerStride(
+                                         std::vector<BasedUser> &UsersToProcess,
+                                         Value *CommonBaseV,
+                                         const IVExpr &ReuseIV,
+                                         Instruction *PreInsertPt);
+    void PrepareToStrengthReduceWithNewPhi(
+                                  std::vector<BasedUser> &UsersToProcess,
+                                  SCEVHandle Stride,
+                                  SCEVHandle CommonExprs,
+                                  Value *CommonBaseV,
+                                  const Loop *L,
+                                  SCEVExpander &PreheaderRewriter);
     void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
                                       IVUsersOfOneStride &Uses,
                                       Loop *L, bool isOnlyStride);
@@ -641,6 +669,13 @@
     /// instruction.
     SCEVHandle Imm;
 
+    /// Phi - The induction variable that performs the striding that
+    /// should be used for this user.
+    Value *Phi;
+
+    /// IncV - The post-incremented value of Phi.
+    Value *IncV;
+
     // isUseOfPostIncrementedValue - True if this should use the
     // post-incremented version of this IV, not the preincremented version.
     // This can only be set in special cases, such as the terminating setcc
@@ -1445,6 +1480,272 @@
   return CommonExprs;
 }
 
+/// ShouldUseFullStrengthReductionMode - Test whether full strength-reduction
+/// is valid and profitable for the given set of users of a stride. In
+/// full strength-reduction mode, all addresses at the current stride are
+/// strength-reduced all the way down to pointer arithmetic.
+///
+bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
+                                   const std::vector<BasedUser> &UsersToProcess,
+                                   const Loop *L,
+                                   bool AllUsesAreAddresses,
+                                   SCEVHandle Stride) {
+  if (!EnableFullLSRMode)
+    return false;
+
+  // The heuristics below aim to avoid increasing register pressure, but
+  // fully strength-reducing all the addresses increases the number of
+  // add instructions, so don't do this when optimizing for size.
+  // TODO: If the loop is large, the savings due to simpler addresses
+  // may outweigh the costs of the extra increment instructions.
+  if (L->getHeader()->getParent()->hasFnAttr(Attribute::OptimizeForSize))
+    return false;
+
+  // TODO: For now, don't do full strength reduction if there could
+  // potentially be greater-stride multiples of the current stride
+  // which could reuse the current stride IV.
+  if (StrideOrder.back() != Stride)
+    return false;
+
+  // Iterate through the uses to find conditions that automatically rule out
+  // full-lsr mode.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
+    SCEV *Base = UsersToProcess[i].Base;
+    SCEV *Imm = UsersToProcess[i].Imm;
+    // If any users have a loop-variant component, they can't be fully
+    // strength-reduced.
+    if (Imm && !Imm->isLoopInvariant(L))
+      return false;
+    // If there are two users with the same base and the difference between
+    // the two Imm values can't be folded into the address, full
+    // strength reduction would increase register pressure.
+    do {
+      SCEV *CurImm = UsersToProcess[i].Imm;
+      if (CurImm || Imm && CurImm != Imm) {
+        if (!CurImm) CurImm = SE->getIntegerSCEV(0, Stride->getType());
+        if (!Imm)       Imm = SE->getIntegerSCEV(0, Stride->getType());
+        const Instruction *Inst = UsersToProcess[i].Inst;
+        const Type *UseTy = Inst->getType();
+        if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
+          UseTy = SI->getOperand(0)->getType();
+        SCEVHandle Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
+        if (!Diff->isZero() &&
+            (!AllUsesAreAddresses ||
+             !fitsInAddressMode(Diff, UseTy, TLI, /*HasBaseReg=*/true)))
+          return false;
+      }
+    } while (++i != e && Base == UsersToProcess[i].Base);
+  }
+
+  // If there's exactly one user in this stride, fully strength-reducing it
+  // won't increase register pressure. If it's starting from a non-zero base,
+  // it'll be simpler this way.
+  if (UsersToProcess.size() == 1 && !UsersToProcess[0].Base->isZero())
+    return true;
+
+  // Otherwise, if there are any users in this stride that don't require
+  // a register for their base, full strength-reduction will increase
+  // register pressure.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+    if (!UsersToProcess[i].Base ||
+        UsersToProcess[i].Base->isZero())
+      return false;
+
+  // Otherwise, go for it.
+  return true;
+}
+
+/// InsertAffinePhi - Create and insert a PHI node for an induction variable
+/// with the specified start and step values in the specified loop.
+///
+/// If NegateStride is true, the stride should be negated by using a
+/// subtract instead of an add.
+///
+/// Return the created phi node, and return the step instruction by
+/// reference in IncV.
+///
+static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
+                                const Loop *L,
+                                SCEVExpander &Rewriter,
+                                Value *&IncV) {
+  assert(Start->isLoopInvariant(L) && "New PHI start is not loop invariant!");
+  assert(Step->isLoopInvariant(L) && "New PHI stride is not loop invariant!");
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  PHINode *PN = PHINode::Create(Start->getType(), "lsr.iv", Header->begin());
+  PN->addIncoming(Rewriter.expandCodeFor(Start, Preheader->getTerminator()),
+                  Preheader);
+
+  pred_iterator HPI = pred_begin(Header);
+  assert(HPI != pred_end(Header) && "Loop with zero preds???");
+  if (!L->contains(*HPI)) ++HPI;
+  assert(HPI != pred_end(Header) && L->contains(*HPI) &&
+         "No backedge in loop?");
+
+  // If the stride is negative, insert a sub instead of an add for the
+  // increment.
+  bool isNegative = isNonConstantNegative(Step);
+  SCEVHandle IncAmount = Step;
+  if (isNegative)
+    IncAmount = Rewriter.SE.getNegativeSCEV(Step);
+
+  // Insert an add instruction right before the terminator corresponding
+  // to the back-edge.
+  Value *StepV = Rewriter.expandCodeFor(IncAmount, Preheader->getTerminator());
+  if (isNegative) {
+    IncV = BinaryOperator::CreateSub(PN, StepV, "lsr.iv.next",
+                                     (*HPI)->getTerminator());
+  } else {
+    IncV = BinaryOperator::CreateAdd(PN, StepV, "lsr.iv.next",
+                                     (*HPI)->getTerminator());
+  }
+  if (!isa<ConstantInt>(StepV)) ++NumVariable;
+
+  pred_iterator PI = pred_begin(Header);
+  if (*PI == L->getLoopPreheader())
+    ++PI;
+  PN->addIncoming(IncV, *PI);
+
+  ++NumInserted;
+  return PN;
+}
+
+static void SortUsersToProcess(std::vector<BasedUser> &UsersToProcess) {
+  // We want to emit code for users inside the loop first.  To do this, we
+  // rearrange BasedUser so that the entries at the end have
+  // isUseOfPostIncrementedValue = false, because we pop off the end of the
+  // vector (so we handle them first).
+  std::partition(UsersToProcess.begin(), UsersToProcess.end(),
+                 PartitionByIsUseOfPostIncrementedValue);
+
+  // Sort this by base, so that things with the same base are handled
+  // together.  By partitioning first and stable-sorting later, we are
+  // guaranteed that within each base we will pop off users from within the
+  // loop before users outside of the loop with a particular base.
+  //
+  // We would like to use stable_sort here, but we can't.  The problem is that
+  // SCEVHandle's don't have a deterministic ordering w.r.t to each other, so
+  // we don't have anything to do a '<' comparison on.  Because we think the
+  // number of uses is small, do a horrible bubble sort which just relies on
+  // ==.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    // Get a base value.
+    SCEVHandle Base = UsersToProcess[i].Base;
+
+    // Compact everything with this base to be consecutive with this one.
+    for (unsigned j = i+1; j != e; ++j) {
+      if (UsersToProcess[j].Base == Base) {
+        std::swap(UsersToProcess[i+1], UsersToProcess[j]);
+        ++i;
+      }
+    }
+  }
+}
+
+/// PrepareToStrengthReduceFully - Prepare to fully strength-reduce UsersToProcess,
+/// meaning lowering addresses all the way down to direct pointer arithmetic.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceFully(
+                                        std::vector<BasedUser> &UsersToProcess,
+                                        SCEVHandle Stride,
+                                        SCEVHandle CommonExprs,
+                                        const Loop *L,
+                                        SCEVExpander &PreheaderRewriter) {
+  DOUT << "  Fully reducing all users\n";
+
+  // Rewrite the UsersToProcess records, creating a separate PHI for each
+  // unique Base value.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
+    // TODO: The uses are grouped by base, but not sorted. We arbitrarily
+    // pick the first Imm value here to start with, and adjust it for the
+    // other uses.
+    SCEVHandle Imm = UsersToProcess[i].Imm;
+    SCEVHandle Base = UsersToProcess[i].Base;
+    SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm);
+    Value *IncV;
+    PHINode *Phi = InsertAffinePhi(Start, Stride, L,
+                                   PreheaderRewriter,
+                                   IncV);
+    // Loop over all the users with the same base.
+    do {
+      UsersToProcess[i].Base = SE->getIntegerSCEV(0, Stride->getType());
+      UsersToProcess[i].Imm = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
+      UsersToProcess[i].Phi = Phi;
+      UsersToProcess[i].IncV = IncV;
+      assert(UsersToProcess[i].Imm->isLoopInvariant(L) &&
+             "ShouldUseFullStrengthReductionMode should reject this!");
+    } while (++i != e && Base == UsersToProcess[i].Base);
+  }
+}
+
+/// PrepareToStrengthReduceWithNewPhi - Insert a new induction variable for the
+/// given users to share.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
+                                         std::vector<BasedUser> &UsersToProcess,
+                                         SCEVHandle Stride,
+                                         SCEVHandle CommonExprs,
+                                         Value *CommonBaseV,
+                                         const Loop *L,
+                                         SCEVExpander &PreheaderRewriter) {
+  DOUT << "  Inserting new PHI:\n";
+
+  Value *IncV;
+  PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV),
+                                 Stride, L,
+                                 PreheaderRewriter,
+                                 IncV);
+
+  // Remember this in case a later stride is multiple of this.
+  IVsByStride[Stride].addIV(Stride, CommonExprs, Phi, IncV);
+
+  // All the users will share this new IV.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    UsersToProcess[i].Phi = Phi;
+    UsersToProcess[i].IncV = IncV;
+  }
+
+  DOUT << "    IV=";
+  DEBUG(WriteAsOperand(*DOUT, Phi, /*PrintType=*/false));
+  DOUT << ", INC=";
+  DEBUG(WriteAsOperand(*DOUT, IncV, /*PrintType=*/false));
+  DOUT << "\n";
+}
+
+/// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to
+/// reuse
+/// an induction variable with a stride that is a factor of the current
+/// induction variable.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceFromSmallerStride(
+                                         std::vector<BasedUser> &UsersToProcess,
+                                         Value *CommonBaseV,
+                                         const IVExpr &ReuseIV,
+                                         Instruction *PreInsertPt) {
+  DOUT << "  Rewriting in terms of existing IV of STRIDE " << *ReuseIV.Stride
+       << " and BASE " << *ReuseIV.Base << "\n";
+
+  // All the users will share the reused IV.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    UsersToProcess[i].Phi = ReuseIV.PHI;
+    UsersToProcess[i].IncV = ReuseIV.IncV;
+  }
+
+  Constant *C = dyn_cast<Constant>(CommonBaseV);
+  if (C &&
+      (!C->isNullValue() &&
+       !fitsInAddressMode(SE->getUnknown(CommonBaseV), CommonBaseV->getType(),
+                         TLI, false)))
+    // We want the common base emitted into the preheader! This is just
+    // using cast as a copy so BitCast (no-op cast) is appropriate
+    CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(),
+                                  "commonbase", PreInsertPt);
+}
+
 /// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
 /// stride of IV.  All of the users may have different starting values, and this
 /// may not be the only stride (we know it is if isOnlyStride is true).
@@ -1476,29 +1777,18 @@
                                           AllUsesAreOutsideLoop,
                                           UsersToProcess);
 
+  // Sort the UsersToProcess array so that users with common bases are
+  // next to each other.
+  SortUsersToProcess(UsersToProcess);
+
   // If we managed to find some expressions in common, we'll need to carry
   // their value in a register and add it in for each use. This will take up
   // a register operand, which potentially restricts what stride values are
   // valid.
   bool HaveCommonExprs = !CommonExprs->isZero();
-  
-  // If all uses are addresses, check if it is possible to reuse an IV with a
-  // stride that is a factor of this stride. And that the multiple is a number
-  // that can be encoded in the scale field of the target addressing mode. And
-  // that we will have a valid instruction after this substition, including the
-  // immediate field, if any.
-  PHINode *NewPHI = NULL;
-  Value   *IncV   = NULL;
-  IVExpr   ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
-                   SE->getIntegerSCEV(0, Type::Int32Ty),
-                   0, 0);
-  SCEVHandle RewriteFactor = 
-                  CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses,
-                                  AllUsesAreOutsideLoop,
-                                  Stride, ReuseIV, CommonExprs->getType(),
-                                  UsersToProcess);
+
   const Type *ReplacedTy = CommonExprs->getType();
-  
+
   // Now that we know what we need to do, insert the PHI node itself.
   //
   DOUT << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE "
@@ -1507,103 +1797,48 @@
 
   SCEVExpander Rewriter(*SE, *LI);
   SCEVExpander PreheaderRewriter(*SE, *LI);
-  
+
   BasicBlock  *Preheader = L->getLoopPreheader();
   Instruction *PreInsertPt = Preheader->getTerminator();
-  Instruction *PhiInsertBefore = L->getHeader()->begin();
   BasicBlock *LatchBlock = L->getLoopLatch();
 
-  // Emit the initial base value into the loop preheader.
-  Value *CommonBaseV
-    = PreheaderRewriter.expandCodeFor(CommonExprs, PreInsertPt);
-
-  if (isa<SCEVConstant>(RewriteFactor) &&
-      cast<SCEVConstant>(RewriteFactor)->isZero()) {
-    // Create a new Phi for this base, and stick it in the loop header.
-    NewPHI = PHINode::Create(ReplacedTy, "iv.", PhiInsertBefore);
-    ++NumInserted;
-  
-    // Add common base to the new Phi node.
-    NewPHI->addIncoming(CommonBaseV, Preheader);
-
-    // If the stride is negative, insert a sub instead of an add for the
-    // increment.
-    bool isNegative = isNonConstantNegative(Stride);
-    SCEVHandle IncAmount = Stride;
-    if (isNegative)
-      IncAmount = SE->getNegativeSCEV(Stride);
-    
-    // Insert the stride into the preheader.
-    Value *StrideV = PreheaderRewriter.expandCodeFor(IncAmount, PreInsertPt);
-    if (!isa<ConstantInt>(StrideV)) ++NumVariable;
-
-    // Emit the increment of the base value before the terminator of the loop
-    // latch block, and add it to the Phi node.
-    SCEVHandle IncExp = SE->getUnknown(StrideV);
-    if (isNegative)
-      IncExp = SE->getNegativeSCEV(IncExp);
-    IncExp = SE->getAddExpr(SE->getUnknown(NewPHI), IncExp);
-  
-    IncV = Rewriter.expandCodeFor(IncExp, LatchBlock->getTerminator());
-    IncV->setName(NewPHI->getName()+".inc");
-    NewPHI->addIncoming(IncV, LatchBlock);
-
-    // Remember this in case a later stride is multiple of this.
-    IVsByStride[Stride].addIV(Stride, CommonExprs, NewPHI, IncV);
-
-    DOUT << "  Inserted new PHI: IV=";
-    DEBUG(WriteAsOperand(*DOUT, NewPHI, /*PrintType=*/false));
-    DOUT << ", INC=";
-    DEBUG(WriteAsOperand(*DOUT, IncV, /*PrintType=*/false));
-    DOUT << "\n";
+  Value *CommonBaseV = ConstantInt::get(ReplacedTy, 0);
+
+  SCEVHandle RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy);
+  IVExpr   ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
+                   SE->getIntegerSCEV(0, Type::Int32Ty),
+                   0, 0);
+
+  /// Choose a strength-reduction strategy and prepare for it by creating
+  /// the necessary PHIs and adjusting the bookkeeping.
+  if (ShouldUseFullStrengthReductionMode(UsersToProcess, L,
+                                         AllUsesAreAddresses, Stride)) {
+    PrepareToStrengthReduceFully(UsersToProcess, Stride, CommonExprs, L,
+                                 PreheaderRewriter);
   } else {
-    DOUT << "  Rewriting in terms of existing IV of STRIDE " << *ReuseIV.Stride
-         << " and BASE " << *ReuseIV.Base << "\n";
-    NewPHI = ReuseIV.PHI;
-    IncV   = ReuseIV.IncV;
-
-    Constant *C = dyn_cast<Constant>(CommonBaseV);
-    if (!C ||
-        (!C->isNullValue() &&
-         !fitsInAddressMode(SE->getUnknown(CommonBaseV), ReplacedTy, 
-                           TLI, false)))
-      // We want the common base emitted into the preheader! This is just
-      // using cast as a copy so BitCast (no-op cast) is appropriate
-      CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(), 
-                                    "commonbase", PreInsertPt);
-  }
+    // Emit the initial base value into the loop preheader.
+    CommonBaseV = PreheaderRewriter.expandCodeFor(CommonExprs, PreInsertPt);
 
-  // We want to emit code for users inside the loop first.  To do this, we
-  // rearrange BasedUser so that the entries at the end have
-  // isUseOfPostIncrementedValue = false, because we pop off the end of the
-  // vector (so we handle them first).
-  std::partition(UsersToProcess.begin(), UsersToProcess.end(),
-                 PartitionByIsUseOfPostIncrementedValue);
-  
-  // Sort this by base, so that things with the same base are handled
-  // together.  By partitioning first and stable-sorting later, we are
-  // guaranteed that within each base we will pop off users from within the
-  // loop before users outside of the loop with a particular base.
-  //
-  // We would like to use stable_sort here, but we can't.  The problem is that
-  // SCEVHandle's don't have a deterministic ordering w.r.t to each other, so
-  // we don't have anything to do a '<' comparison on.  Because we think the
-  // number of uses is small, do a horrible bubble sort which just relies on
-  // ==.
-  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
-    // Get a base value.
-    SCEVHandle Base = UsersToProcess[i].Base;
-    
-    // Compact everything with this base to be consecutive with this one.
-    for (unsigned j = i+1; j != e; ++j) {
-      if (UsersToProcess[j].Base == Base) {
-        std::swap(UsersToProcess[i+1], UsersToProcess[j]);
-        ++i;
-      }
-    }
+    // If all uses are addresses, check if it is possible to reuse an IV with a
+    // stride that is a factor of this stride. And that the multiple is a number
+    // that can be encoded in the scale field of the target addressing mode. And
+    // that we will have a valid instruction after this substitution, including
+    // the
+    // immediate field, if any.
+    RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses,
+                                    AllUsesAreOutsideLoop,
+                                    Stride, ReuseIV, CommonExprs->getType(),
+                                    UsersToProcess);
+    if (isa<SCEVConstant>(RewriteFactor) &&
+        cast<SCEVConstant>(RewriteFactor)->isZero())
+      PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
+                                        CommonBaseV, L, PreheaderRewriter);
+    else
+      PrepareToStrengthReduceFromSmallerStride(UsersToProcess, CommonBaseV,
+                                               ReuseIV, PreInsertPt);
   }
 
-  // Process all the users now.  This outer loop handles all bases, the inner
+  // Process all the users now, replacing their strided uses with
+  // strength-reduced forms.  This outer loop handles all bases, the inner
   // loop handles all users of a particular base.
   while (!UsersToProcess.empty()) {
     SCEVHandle Base = UsersToProcess.back().Base;
@@ -1643,9 +1878,9 @@
 
       // If this instruction wants to use the post-incremented value, move it
       // after the post-inc and use its value instead of the PHI.
-      Value *RewriteOp = NewPHI;
+      Value *RewriteOp = User.Phi;
       if (User.isUseOfPostIncrementedValue) {
-        RewriteOp = IncV;
+        RewriteOp = User.IncV;
 
         // If this user is in the loop, make sure it is the last thing in the
         // loop to ensure it is dominated by the increment.
@@ -1670,7 +1905,7 @@
       // PHI node, we can use the later point to expand the final
       // RewriteExpr.
       Instruction *NewBasePt = dyn_cast<Instruction>(RewriteOp);
-      if (RewriteOp == NewPHI) NewBasePt = 0;
+      if (RewriteOp == User.Phi) NewBasePt = 0;
 
       // Clear the SCEVExpander's expression map so that we are guaranteed
       // to have the code emitted where we expect it.

Added: llvm/trunk/test/CodeGen/X86/full-lsr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/full-lsr.ll?rev=65108&view=auto

==============================================================================
--- llvm/trunk/test/CodeGen/X86/full-lsr.ll (added)
+++ llvm/trunk/test/CodeGen/X86/full-lsr.ll Thu Feb 19 22:17:46 2009
@@ -0,0 +1,33 @@
+; RUN: llvm-as < %s | llc -march=x86 -enable-full-lsr >%t
+; RUN: grep {addl	\\\$4,} %t | count 3
+; RUN: not grep {,%} %t
+
+define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
+entry:
+	%0 = icmp sgt i32 %N, 0		; <i1> [#uses=1]
+	br i1 %0, label %bb, label %return
+
+bb:		; preds = %bb, %entry
+	%i.03 = phi i32 [ 0, %entry ], [ %indvar.next, %bb ]		; <i32> [#uses=5]
+	%1 = getelementptr float* %A, i32 %i.03		; <float*> [#uses=1]
+	%2 = load float* %1, align 4		; <float> [#uses=1]
+	%3 = getelementptr float* %B, i32 %i.03		; <float*> [#uses=1]
+	%4 = load float* %3, align 4		; <float> [#uses=1]
+	%5 = add float %2, %4		; <float> [#uses=1]
+	%6 = getelementptr float* %C, i32 %i.03		; <float*> [#uses=1]
+	store float %5, float* %6, align 4
+	%7 = add i32 %i.03, 10		; <i32> [#uses=3]
+	%8 = getelementptr float* %A, i32 %7		; <float*> [#uses=1]
+	%9 = load float* %8, align 4		; <float> [#uses=1]
+	%10 = getelementptr float* %B, i32 %7		; <float*> [#uses=1]
+	%11 = load float* %10, align 4		; <float> [#uses=1]
+	%12 = add float %9, %11		; <float> [#uses=1]
+	%13 = getelementptr float* %C, i32 %7		; <float*> [#uses=1]
+	store float %12, float* %13, align 4
+	%indvar.next = add i32 %i.03, 1		; <i32> [#uses=2]
+	%exitcond = icmp eq i32 %indvar.next, %N		; <i1> [#uses=1]
+	br i1 %exitcond, label %return, label %bb
+
+return:		; preds = %bb, %entry
+	ret void
+}

Modified: llvm/trunk/test/CodeGen/X86/iv-users-in-other-loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/iv-users-in-other-loops.ll?rev=65108&r1=65107&r2=65108&view=diff

==============================================================================
--- llvm/trunk/test/CodeGen/X86/iv-users-in-other-loops.ll (original)
+++ llvm/trunk/test/CodeGen/X86/iv-users-in-other-loops.ll Thu Feb 19 22:17:46 2009
@@ -1,6 +1,8 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 -f -o %t
 ; RUN: grep inc %t | count 2
-; RUN: grep addq %t | count 11
+; RUN: grep addq %t | count 13
+; RUN: grep leaq %t | count 10
+; RUN: grep movq %t | count 5
 
 ; IV users in each of the loops from other loops shouldn't cause LSR
 ; to insert new induction variables. Previously it would create a

Modified: llvm/trunk/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll?rev=65108&r1=65107&r2=65108&view=diff

==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll (original)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll Thu Feb 19 22:17:46 2009
@@ -1,5 +1,5 @@
 ; RUN: llvm-as < %s | opt -loop-reduce | llvm-dis | \
-; RUN:   grep {add i32 %iv.*inc, 1}
+; RUN:   grep {add i32 %lsr.iv.next, 1}
 ;
 ; Make sure that the use of the IV outside of the loop (the store) uses the 
 ; post incremented value of the IV, not the preincremented value.  This 





More information about the llvm-commits mailing list