[llvm] r339501 - [UnJ] Improve explicit loop count checks
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 11 00:37:32 PDT 2018
Author: dmgreen
Date: Sat Aug 11 00:37:31 2018
New Revision: 339501
URL: http://llvm.org/viewvc/llvm-project?rev=339501&view=rev
Log:
[UnJ] Improve explicit loop count checks
Try to improve the computed counts when they have been explicitly set by a
pragma or command line option. This moves the code around so that we first call
computeUnrollCount to get a sensible count, and then override that if explicit
unroll and jam counts are specified.
Also added some extra debug messages for the cases where unroll and jam is disabled.
Differential Revision: https://reviews.llvm.org/D50075
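For illustration, here is a minimal sketch of how an explicit count can be
requested (a hypothetical kernel, not part of this commit): either attach
llvm.loop.unroll_and_jam.count metadata to the outer loop's latch branch, as
the test below does, or pass -unroll-and-jam-count to opt. With this patch,
such an explicit count overrides whatever computeUnrollCount computes, unless
the unroller itself claims the loop first.

  ; Run with: opt -loop-unroll-and-jam -allow-unroll-and-jam -S < sketch.ll
  ; Assumes %n and %m are both at least 1.
  define void @sketch(i32* noalias %dst, i32* noalias %src, i64 %n, i64 %m) {
  entry:
    br label %outer

  outer:                                    ; the loop being unroll-and-jammed
    %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
    br label %inner

  inner:                                    ; single-block inner loop
    %j = phi i64 [ 0, %outer ], [ %j.next, %inner ]
    %src.addr = getelementptr inbounds i32, i32* %src, i64 %j
    %v = load i32, i32* %src.addr, align 4  ; address is invariant in the outer loop
    %row = mul i64 %i, %m
    %idx = add i64 %row, %j
    %dst.addr = getelementptr inbounds i32, i32* %dst, i64 %idx
    store i32 %v, i32* %dst.addr, align 4
    %j.next = add nuw nsw i64 %j, 1
    %inner.done = icmp eq i64 %j.next, %m
    br i1 %inner.done, label %outer.latch, label %inner

  outer.latch:
    %i.next = add nuw nsw i64 %i, 1
    %outer.done = icmp eq i64 %i.next, %n
    br i1 %outer.done, label %exit, label %outer, !llvm.loop !0

  exit:
    ret void
  }

  !0 = distinct !{!0, !1}
  !1 = !{!"llvm.loop.unroll_and_jam.count", i32 4}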
Added:
llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll
Modified:
llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
Modified: llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp?rev=339501&r1=339500&r2=339501&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp Sat Aug 11 00:37:31 2018
@@ -149,7 +149,26 @@ static bool computeUnrollAndJamCount(
OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
- // Check for explicit Count from the "unroll-and-jam-count" option.
+ // First, use computeUnrollCount from the loop unroller to get a count
+ // for unrolling the outer loop; any loops requiring explicit unrolling
+ // we leave to the unroller. This uses UP.Threshold /
+ // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
+ // We have already checked that the loop has no unroll.* pragmas.
+ unsigned MaxTripCount = 0;
+ bool UseUpperBound = false;
+ bool ExplicitUnroll = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+ OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+ if (ExplicitUnroll || UseUpperBound) {
+ // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
+ // for the unroller instead.
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; explicit count set by "
+ "computeUnrollCount\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // Override with any explicit Count from the "unroll-and-jam-count" option.
bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
if (UserUnrollCount) {
UP.Count = UnrollAndJamCount;
@@ -174,80 +193,76 @@ static bool computeUnrollAndJamCount(
return true;
}
- // Use computeUnrollCount from the loop unroller to get a sensible count
- // for the unrolling the outer loop. This uses UP.Threshold /
- // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
- // We have already checked that the loop has no unroll.* pragmas.
- unsigned MaxTripCount = 0;
- bool UseUpperBound = false;
- bool ExplicitUnroll = computeUnrollCount(
- L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
- OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
- if (ExplicitUnroll || UseUpperBound) {
- // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it
- // for the unroller instead.
- UP.Count = 0;
- return false;
- }
-
bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
- ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
+ bool ExplicitUnrollAndJamCount = PragmaCount > 0 || UserUnrollCount;
+ bool ExplicitUnrollAndJam = PragmaEnableUnroll || ExplicitUnrollAndJamCount;
// If the loop has an unrolling pragma, we want to be more aggressive with
// unrolling limits.
- if (ExplicitUnroll && OuterTripCount != 0)
+ if (ExplicitUnrollAndJam)
UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
UP.UnrollAndJamInnerLoopThreshold) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't create remainder and "
+ "inner loop too large\n");
UP.Count = 0;
return false;
}
+ // We have a sensible limit for the outer loop, now adjust it for the inner
+ // loop and UP.UnrollAndJamInnerLoopThreshold. If the outer limit was set
+ // explicitly, we want to stick to it.
+ if (!ExplicitUnrollAndJamCount && UP.AllowRemainder) {
+ while (UP.Count != 0 && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold)
+ UP.Count--;
+ }
+
+ // If we are explicitly unroll and jamming, we are done. Otherwise there are a
+ // number of extra performance heuristics to check.
+ if (ExplicitUnrollAndJam)
+ return true;
+
// If the inner loop count is known and small, leave the entire loop nest to
// be the unroller
- if (!ExplicitUnroll && InnerTripCount &&
- InnerLoopSize * InnerTripCount < UP.Threshold) {
+ if (InnerTripCount && InnerLoopSize * InnerTripCount < UP.Threshold) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; small inner loop count is "
+ "being left for the unroller\n");
UP.Count = 0;
return false;
}
- // We have a sensible limit for the outer loop, now adjust it for the inner
- // loop and UP.UnrollAndJamInnerLoopThreshold.
- while (UP.Count != 0 && UP.AllowRemainder &&
- getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
- UP.UnrollAndJamInnerLoopThreshold)
- UP.Count--;
-
- if (!ExplicitUnroll) {
- // Check for situations where UnJ is likely to be unprofitable. Including
- // subloops with more than 1 block.
- if (SubLoop->getBlocks().size() != 1) {
- UP.Count = 0;
- return false;
- }
+ // Check for situations where UnJ is likely to be unprofitable, including
+ // subloops with more than one block.
+ if (SubLoop->getBlocks().size() != 1) {
+ LLVM_DEBUG(
+ dbgs() << "Won't unroll-and-jam; More than one inner loop block\n");
+ UP.Count = 0;
+ return false;
+ }
- // Limit to loops where there is something to gain from unrolling and
- // jamming the loop. In this case, look for loads that are invariant in the
- // outer loop and can become shared.
- unsigned NumInvariant = 0;
- for (BasicBlock *BB : SubLoop->getBlocks()) {
- for (Instruction &I : *BB) {
- if (auto *Ld = dyn_cast<LoadInst>(&I)) {
- Value *V = Ld->getPointerOperand();
- const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
- if (SE.isLoopInvariant(LSCEV, L))
- NumInvariant++;
- }
+ // Limit to loops where there is something to gain from unrolling and
+ // jamming the loop. In this case, look for loads that are invariant in the
+ // outer loop and can become shared.
+ unsigned NumInvariant = 0;
+ for (BasicBlock *BB : SubLoop->getBlocks()) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ Value *V = Ld->getPointerOperand();
+ const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
+ if (SE.isLoopInvariant(LSCEV, L))
+ NumInvariant++;
}
}
- if (NumInvariant == 0) {
- UP.Count = 0;
- return false;
- }
+ }
+ if (NumInvariant == 0) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; No loop invariant loads\n");
+ UP.Count = 0;
+ return false;
}
- return ExplicitUnroll;
+ return false;
}
static LoopUnrollResult
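Conversely, when only the enable form of the metadata is present (as in
@function2 in the test below), the count computed by computeUnrollCount now
stands, adjusted downwards (when a remainder loop is allowed) until the jammed
inner loop fits under UP.UnrollAndJamInnerLoopThreshold. A minimal metadata
sketch, reusing the loop shape from the example above:

  ; Attach to the outer loop's latch branch: br i1 ..., !llvm.loop !0
  !0 = distinct !{!0, !1}
  !1 = !{!"llvm.loop.unroll_and_jam.enable"}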
Added: llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll?rev=339501&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll Sat Aug 11 00:37:31 2018
@@ -0,0 +1,144 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: function
+; The explicit metadata here should force this to be unroll and jammed 4 times (hence the %.pre60.3)
+; CHECK: %.pre = phi i8 [ %.pre60.3, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %.pre.pre, %for.cond1.preheader.us.preheader.new ]
+; CHECK: %indvars.iv.3 = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next.3, %for.body4.us ]
+define void @function(i8* noalias nocapture %dst, i32 %dst_stride, i8* noalias nocapture readonly %src, i32 %src_stride, i32 %A, i32 %B, i32 %C, i32 %D, i32 %width, i32 %height) {
+entry:
+ %idxprom = sext i32 %src_stride to i64
+ %cmp52 = icmp sgt i32 %height, 0
+ br i1 %cmp52, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp249 = icmp sgt i32 %width, 0
+ %idx.ext = sext i32 %dst_stride to i64
+ br i1 %cmp249, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.us.preheader: ; preds = %for.cond1.preheader.lr.ph
+ %.pre.pre = load i8, i8* %src, align 1
+ %wide.trip.count = zext i32 %width to i64
+ br label %for.cond1.preheader.us
+
+for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
+ %.pre = phi i8 [ %.pre60, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %.pre.pre, %for.cond1.preheader.us.preheader ]
+ %srcp.056.us.pn = phi i8* [ %srcp.056.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src, %for.cond1.preheader.us.preheader ]
+ %y.055.us = phi i32 [ %inc30.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+ %dst.addr.054.us = phi i8* [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
+ %srcp.056.us = getelementptr inbounds i8, i8* %srcp.056.us.pn, i64 %idxprom
+ %.pre60 = load i8, i8* %srcp.056.us, align 1
+ br label %for.body4.us
+
+for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
+ %0 = phi i8 [ %.pre60, %for.cond1.preheader.us ], [ %3, %for.body4.us ]
+ %1 = phi i8 [ %.pre, %for.cond1.preheader.us ], [ %2, %for.body4.us ]
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.body4.us ]
+ %conv.us = zext i8 %1 to i32
+ %mul.us = mul nsw i32 %conv.us, %A
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx8.us = getelementptr inbounds i8, i8* %srcp.056.us.pn, i64 %indvars.iv.next
+ %2 = load i8, i8* %arrayidx8.us, align 1
+ %conv9.us = zext i8 %2 to i32
+ %mul10.us = mul nsw i32 %conv9.us, %B
+ %conv14.us = zext i8 %0 to i32
+ %mul15.us = mul nsw i32 %conv14.us, %C
+ %arrayidx19.us = getelementptr inbounds i8, i8* %srcp.056.us, i64 %indvars.iv.next
+ %3 = load i8, i8* %arrayidx19.us, align 1
+ %conv20.us = zext i8 %3 to i32
+ %mul21.us = mul nsw i32 %conv20.us, %D
+ %add11.us = add i32 %mul.us, 32
+ %add16.us = add i32 %add11.us, %mul10.us
+ %add22.us = add i32 %add16.us, %mul15.us
+ %add23.us = add i32 %add22.us, %mul21.us
+ %4 = lshr i32 %add23.us, 6
+ %conv24.us = trunc i32 %4 to i8
+ %arrayidx26.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %indvars.iv
+ store i8 %conv24.us, i8* %arrayidx26.us, align 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us
+ %add.ptr.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %idx.ext
+ %inc30.us = add nuw nsw i32 %y.055.us, 1
+ %exitcond58 = icmp eq i32 %inc30.us, %height
+ br i1 %exitcond58, label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !5
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
+ ret void
+}
+
+; CHECK-LABEL: function2
+; The explicit metadata here should force this to be unroll and jammed, but
+; the count is left to thresholds. In this case 2 (hence %.pre60.1).
+; CHECK: %.pre = phi i8 [ %.pre60.1, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %.pre.pre, %for.cond1.preheader.us.preheader.new ]
+; CHECK: %indvars.iv.1 = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next.1, %for.body4.us ]
+define void @function2(i8* noalias nocapture %dst, i32 %dst_stride, i8* noalias nocapture readonly %src, i32 %src_stride, i32 %A, i32 %B, i32 %C, i32 %D, i32 %width, i32 %height) {
+entry:
+ %idxprom = sext i32 %src_stride to i64
+ %cmp52 = icmp sgt i32 %height, 0
+ br i1 %cmp52, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp249 = icmp sgt i32 %width, 0
+ %idx.ext = sext i32 %dst_stride to i64
+ br i1 %cmp249, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.us.preheader: ; preds = %for.cond1.preheader.lr.ph
+ %.pre.pre = load i8, i8* %src, align 1
+ %wide.trip.count = zext i32 %width to i64
+ br label %for.cond1.preheader.us
+
+for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
+ %.pre = phi i8 [ %.pre60, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %.pre.pre, %for.cond1.preheader.us.preheader ]
+ %srcp.056.us.pn = phi i8* [ %srcp.056.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src, %for.cond1.preheader.us.preheader ]
+ %y.055.us = phi i32 [ %inc30.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+ %dst.addr.054.us = phi i8* [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
+ %srcp.056.us = getelementptr inbounds i8, i8* %srcp.056.us.pn, i64 %idxprom
+ %.pre60 = load i8, i8* %srcp.056.us, align 1
+ br label %for.body4.us
+
+for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
+ %0 = phi i8 [ %.pre60, %for.cond1.preheader.us ], [ %3, %for.body4.us ]
+ %1 = phi i8 [ %.pre, %for.cond1.preheader.us ], [ %2, %for.body4.us ]
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.body4.us ]
+ %conv.us = zext i8 %1 to i32
+ %mul.us = mul nsw i32 %conv.us, %A
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx8.us = getelementptr inbounds i8, i8* %srcp.056.us.pn, i64 %indvars.iv.next
+ %2 = load i8, i8* %arrayidx8.us, align 1
+ %conv9.us = zext i8 %2 to i32
+ %mul10.us = mul nsw i32 %conv9.us, %B
+ %conv14.us = zext i8 %0 to i32
+ %mul15.us = mul nsw i32 %conv14.us, %C
+ %arrayidx19.us = getelementptr inbounds i8, i8* %srcp.056.us, i64 %indvars.iv.next
+ %3 = load i8, i8* %arrayidx19.us, align 1
+ %conv20.us = zext i8 %3 to i32
+ %mul21.us = mul nsw i32 %conv20.us, %D
+ %add11.us = add i32 %mul.us, 32
+ %add16.us = add i32 %add11.us, %mul10.us
+ %add22.us = add i32 %add16.us, %mul15.us
+ %add23.us = add i32 %add22.us, %mul21.us
+ %4 = lshr i32 %add23.us, 6
+ %conv24.us = trunc i32 %4 to i8
+ %arrayidx26.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %indvars.iv
+ store i8 %conv24.us, i8* %arrayidx26.us, align 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us
+ %add.ptr.us = getelementptr inbounds i8, i8* %dst.addr.054.us, i64 %idx.ext
+ %inc30.us = add nuw nsw i32 %y.055.us, 1
+ %exitcond58 = icmp eq i32 %inc30.us, %height
+ br i1 %exitcond58, label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !7
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
+ ret void
+}
+
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.unroll_and_jam.count", i32 4}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.unroll_and_jam.enable"}