[llvm-commits] [llvm] r43231 - in /llvm/trunk: lib/Transforms/Scalar/LoopStrengthReduce.cpp test/CodeGen/X86/2007-08-10-LEA16Use32.ll test/CodeGen/X86/stride-nine-with-base-reg.ll test/CodeGen/X86/stride-reuse.ll

Dan Gohman djg at cray.com
Mon Oct 22 13:40:42 PDT 2007


Author: djg
Date: Mon Oct 22 15:40:42 2007
New Revision: 43231

URL: http://llvm.org/viewvc/llvm-project?rev=43231&view=rev
Log:
Strength reduction improvements.

 - Avoid attempting stride-reuse in the case that there are users that
   aren't addresses. In that case, there will be places where the
   multiplications won't be folded away, so it's better to try to
   strength-reduce them.

 - Several SSE intrinsics have operands that strength-reduction can
   treat as addresses. The previous item makes this more visible, as
   any non-address use of an IV can inhibit stride-reuse.

 - Make ValidStride aware of whether there's likely to be a base
   register in the address computation. This prevents it from thinking
   that things like stride 9 are valid on x86 when the base register is
   already occupied.

Also, XFAIL the 2007-08-10-LEA16Use32.ll test; the new logic to avoid
stride-reuse eliminates the LEA in the loop, so the test is no longer
testing what it was intended to test.

Added:
    llvm/trunk/test/CodeGen/X86/stride-nine-with-base-reg.ll
    llvm/trunk/test/CodeGen/X86/stride-reuse.ll
Modified:
    llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
    llvm/trunk/test/CodeGen/X86/2007-08-10-LEA16Use32.ll

Modified: llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp?rev=43231&r1=43230&r2=43231&view=diff

==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp Mon Oct 22 15:40:42 2007
@@ -175,10 +175,12 @@
     bool FindIVForUser(ICmpInst *Cond, IVStrideUse *&CondUse,
                        const SCEVHandle *&CondStride);
 
-    unsigned CheckForIVReuse(const SCEVHandle&, IVExpr&, const Type*,
+    unsigned CheckForIVReuse(bool, const SCEVHandle&,
+                             IVExpr&, const Type*,
                              const std::vector<BasedUser>& UsersToProcess);
 
-    bool ValidStride(int64_t, const std::vector<BasedUser>& UsersToProcess);
+    bool ValidStride(bool, int64_t,
+                     const std::vector<BasedUser>& UsersToProcess);
 
     void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
                                       IVUsersOfOneStride &Uses,
@@ -937,8 +939,8 @@
 
 /// isZero - returns true if the scalar evolution expression is zero.
 ///
-static bool isZero(SCEVHandle &V) {
-  if (SCEVConstant *SC = dyn_cast<SCEVConstant>(V))
+static bool isZero(const SCEVHandle &V) {
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V))
     return SC->getValue()->isZero();
   return false;
 }
@@ -946,7 +948,8 @@
 /// ValidStride - Check whether the given Scale is valid for all loads and 
 /// stores in UsersToProcess.
 ///
-bool LoopStrengthReduce::ValidStride(int64_t Scale, 
+bool LoopStrengthReduce::ValidStride(bool HasBaseReg,
+                               int64_t Scale, 
                                const std::vector<BasedUser>& UsersToProcess) {
   for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) {
     // If this is a load or other access, pass the type of the access in.
@@ -959,6 +962,7 @@
     TargetLowering::AddrMode AM;
     if (SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
       AM.BaseOffs = SC->getValue()->getSExtValue();
+    AM.HasBaseReg = HasBaseReg || !isZero(UsersToProcess[i].Base);
     AM.Scale = Scale;
 
     // If load[imm+r*scale] is illegal, bail out.
@@ -970,9 +974,11 @@
 
 /// CheckForIVReuse - Returns the multiple if the stride is the multiple
 /// of a previous stride and it is a legal value for the target addressing
-/// mode scale component. This allows the users of this stride to be rewritten
-/// as prev iv * factor. It returns 0 if no reuse is possible.
-unsigned LoopStrengthReduce::CheckForIVReuse(const SCEVHandle &Stride, 
+/// mode scale component and optional base reg. This allows the users of
+/// this stride to be rewritten as prev iv * factor. It returns 0 if no
+/// reuse is possible.
+unsigned LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
+                                const SCEVHandle &Stride, 
                                 IVExpr &IV, const Type *Ty,
                                 const std::vector<BasedUser>& UsersToProcess) {
   if (!TLI) return 0;
@@ -992,7 +998,7 @@
       // stores; if it can be used for some and not others, we might as well use
       // the original stride everywhere, since we have to create the IV for it
       // anyway.
-      if (ValidStride(Scale, UsersToProcess))
+      if (ValidStride(HasBaseReg, Scale, UsersToProcess))
         for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
                IE = SI->second.IVs.end(); II != IE; ++II)
           // FIXME: Only handle base == 0 for now.
@@ -1061,7 +1067,18 @@
   // UsersToProcess base values.
   SCEVHandle CommonExprs =
     RemoveCommonExpressionsFromUseBases(UsersToProcess, SE);
-  
+
+  // If we managed to find some expressions in common, we'll need to carry
+  // their value in a register and add it in for each use. This will take up
+  // a register operand, which potentially restricts what stride values are
+  // valid.
+  bool HaveCommonExprs = !isZero(CommonExprs);
+  
+  // Keep track if every use in UsersToProcess is an address. If they all are,
+  // we may be able to rewrite the entire collection of them in terms of a
+  // smaller-stride IV.
+  bool AllUsesAreAddresses = true;
+
   // Next, figure out what we can represent in the immediate fields of
   // instructions.  If we can represent anything there, move it to the imm
   // fields of the BasedUsers.  We do this so that it increases the commonality
@@ -1085,29 +1102,53 @@
           isAddress = true;
       } else if (IntrinsicInst *II =
                    dyn_cast<IntrinsicInst>(UsersToProcess[i].Inst)) {
-        // Addressing modes can also be folded into prefetches.
-        if (II->getIntrinsicID() == Intrinsic::prefetch &&
-            II->getOperand(1) == UsersToProcess[i].OperandValToReplace)
-          isAddress = true;
+        // Addressing modes can also be folded into prefetches and a variety
+        // of intrinsics.
+        switch (II->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::prefetch:
+        case Intrinsic::x86_sse2_loadu_dq:
+        case Intrinsic::x86_sse2_loadu_pd:
+        case Intrinsic::x86_sse_loadu_ps:
+        case Intrinsic::x86_sse_storeu_ps:
+        case Intrinsic::x86_sse2_storeu_pd:
+        case Intrinsic::x86_sse2_storeu_dq:
+        case Intrinsic::x86_sse2_storel_dq:
+          if (II->getOperand(1) == UsersToProcess[i].OperandValToReplace)
+            isAddress = true;
+          break;
+        case Intrinsic::x86_sse2_loadh_pd:
+        case Intrinsic::x86_sse2_loadl_pd:
+          if (II->getOperand(2) == UsersToProcess[i].OperandValToReplace)
+            isAddress = true;
+          break;
+        }
       }
+
+      // If this use isn't an address, then not all uses are addresses.
+      if (!isAddress)
+        AllUsesAreAddresses = false;
       
       MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base,
                           UsersToProcess[i].Imm, isAddress, L, SE);
     }
   }
 
-  // Check if it is possible to reuse a IV with stride that is factor of this
-  // stride. And the multiple is a number that can be encoded in the scale
-  // field of the target addressing mode.  And we will have a valid
-  // instruction after this substition, including the immediate field, if any.
+  // If all uses are addresses, check if it is possible to reuse an IV with a
+  // stride that is a factor of this stride. And that the multiple is a number
+  // that can be encoded in the scale field of the target addressing mode. And
+  // that we will have a valid instruction after this substitution, including the
+  // immediate field, if any.
   PHINode *NewPHI = NULL;
   Value   *IncV   = NULL;
   IVExpr   ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
                    SE->getIntegerSCEV(0, Type::Int32Ty),
                    0, 0);
-  unsigned RewriteFactor = CheckForIVReuse(Stride, ReuseIV,
-                                           CommonExprs->getType(),
-                                           UsersToProcess);
+  unsigned RewriteFactor = 0;
+  if (AllUsesAreAddresses)
+    RewriteFactor = CheckForIVReuse(HaveCommonExprs, Stride, ReuseIV,
+                                    CommonExprs->getType(),
+                                    UsersToProcess);
   if (RewriteFactor != 0) {
     DOUT << "BASED ON IV of STRIDE " << *ReuseIV.Stride
          << " and BASE " << *ReuseIV.Base << " :\n";

Modified: llvm/trunk/test/CodeGen/X86/2007-08-10-LEA16Use32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2007-08-10-LEA16Use32.ll?rev=43231&r1=43230&r2=43231&view=diff

==============================================================================
--- llvm/trunk/test/CodeGen/X86/2007-08-10-LEA16Use32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2007-08-10-LEA16Use32.ll Mon Oct 22 15:40:42 2007
@@ -1,4 +1,8 @@
 ; RUN: llvm-as < %s | llc -march=x86 | grep {leal}
+; XFAIL: *
+; This test is XFAIL'd because strength-reduction was improved to
+; avoid emitting the lea, so it no longer tests whether the 16-bit
+; lea is avoided.
 
 @X = global i16 0               ; <i16*> [#uses=1]
 @Y = global i16 0               ; <i16*> [#uses=1]

Added: llvm/trunk/test/CodeGen/X86/stride-nine-with-base-reg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stride-nine-with-base-reg.ll?rev=43231&view=auto

==============================================================================
--- llvm/trunk/test/CodeGen/X86/stride-nine-with-base-reg.ll (added)
+++ llvm/trunk/test/CodeGen/X86/stride-nine-with-base-reg.ll Mon Oct 22 15:40:42 2007
@@ -0,0 +1,34 @@
+; RUN: llvm-as < %s | llc -march=x86 | grep lea | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 | not grep lea
+
+; For x86 there's an lea above the loop. In both cases, there shouldn't
+; be any lea instructions inside the loop.
+
+ at B = external global [1000 x i8], align 32
+ at A = external global [1000 x i8], align 32
+ at P = external global [1000 x i8], align 32
+
+define void @foo(i32 %m, i32 %p) {
+entry:
+	%tmp1 = icmp sgt i32 %m, 0
+	br i1 %tmp1, label %bb, label %return
+
+bb:
+	%i.019.0 = phi i32 [ %indvar.next, %bb ], [ 0, %entry ]
+	%tmp2 = getelementptr [1000 x i8]* @B, i32 0, i32 %i.019.0
+	%tmp3 = load i8* %tmp2, align 4
+	%tmp4 = mul i8 %tmp3, 2
+	%tmp5 = getelementptr [1000 x i8]* @A, i32 0, i32 %i.019.0
+	store i8 %tmp4, i8* %tmp5, align 4
+	%tmp8 = mul i32 %i.019.0, 9
+        %tmp0 = add i32 %tmp8, %p
+	%tmp10 = getelementptr [1000 x i8]* @P, i32 0, i32 %tmp0
+	store i8 17, i8* %tmp10, align 4
+	%indvar.next = add i32 %i.019.0, 1
+	%exitcond = icmp eq i32 %indvar.next, %m
+	br i1 %exitcond, label %return, label %bb
+
+return:
+	ret void
+}
+

Added: llvm/trunk/test/CodeGen/X86/stride-reuse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stride-reuse.ll?rev=43231&view=auto

==============================================================================
--- llvm/trunk/test/CodeGen/X86/stride-reuse.ll (added)
+++ llvm/trunk/test/CodeGen/X86/stride-reuse.ll Mon Oct 22 15:40:42 2007
@@ -0,0 +1,30 @@
+; RUN: llvm-as < %s | llc -march=x86 | not grep lea
+; RUN: llvm-as < %s | llc -march=x86-64 | not grep lea
+
+ at B = external global [1000 x float], align 32
+ at A = external global [1000 x float], align 32
+ at P = external global [1000 x i32], align 32
+
+define void @foo(i32 %m) {
+entry:
+	%tmp1 = icmp sgt i32 %m, 0
+	br i1 %tmp1, label %bb, label %return
+
+bb:
+	%i.019.0 = phi i32 [ %indvar.next, %bb ], [ 0, %entry ]
+	%tmp2 = getelementptr [1000 x float]* @B, i32 0, i32 %i.019.0
+	%tmp3 = load float* %tmp2, align 4
+	%tmp4 = mul float %tmp3, 2.000000e+00
+	%tmp5 = getelementptr [1000 x float]* @A, i32 0, i32 %i.019.0
+	store float %tmp4, float* %tmp5, align 4
+	%tmp8 = shl i32 %i.019.0, 1
+	%tmp9 = add i32 %tmp8, 64
+	%tmp10 = getelementptr [1000 x i32]* @P, i32 0, i32 %i.019.0
+	store i32 %tmp9, i32* %tmp10, align 4
+	%indvar.next = add i32 %i.019.0, 1
+	%exitcond = icmp eq i32 %indvar.next, %m
+	br i1 %exitcond, label %return, label %bb
+
+return:
+	ret void
+}





More information about the llvm-commits mailing list