[polly] r256199 - Add option to assume single-loop scops with sufficient compute are profitable

Mon Dec 21 13:00:44 PST 2015

Author: grosser
Date: Mon Dec 21 15:00:43 2015
New Revision: 256199

URL: http://llvm.org/viewvc/llvm-project?rev=256199&view=rev
Log:
Add option to assume single-loop scops with sufficient compute are profitable

If a loop has a sufficiently large amount of compute instruction in its loop
body, it is unlikely that our rewrite of the loop iterators introduces large
performance changes. As Polly can also apply beneficical optimizations (such
as parallelization) to such loop nests, we mark them as profitable.

This option is currently "disabled" by default, but can be used to run
experiments. If enabled by setting it e.g. to 40 instructions, we currently
see some compile-time increases on LNT without any significant run-time
changes.

Added:
    polly/trunk/test/ScopDetect/profitability-large-basic-blocks.ll
Modified:
    polly/trunk/include/polly/ScopDetection.h
    polly/trunk/lib/Analysis/ScopDetection.cpp

Modified: polly/trunk/include/polly/ScopDetection.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/ScopDetection.h?rev=256199&r1=256198&r2=256199&view=diff
==============================================================================

--- polly/trunk/include/polly/ScopDetection.h (original)
+++ polly/trunk/include/polly/ScopDetection.h Mon Dec 21 15:00:43 2015
@@ -287,6 +287,20 @@ private:
   /// @return True if all blocks in R are valid, false otherwise.
   bool allBlocksValid(DetectionContext &Context) const;
 
+  /// @brief Check if a region has sufficient compute instructions
+  ///
+  /// This function checks if a region has a non-trivial number of instructions
+  /// in each loop. This can be used as an indicator if a loop is worth
+  /// optimising.
+  ///
+  /// @param Context  The context of scop detection.
+  /// @param NumLoops The number of loops in the region.
+  ///
+  /// @return True if region is has sufficient compute instructions,
+  ///         false otherwise.
+  bool hasSufficientCompute(DetectionContext &Context,
+                            int NumAffineLoops) const;
+
   /// @brief Check if a region is profitable to optimize.
   ///
   /// Regions that are unlikely to expose interesting optimization opportunities

Modified: polly/trunk/lib/Analysis/ScopDetection.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Analysis/ScopDetection.cpp?rev=256199&r1=256198&r2=256199&view=diff
==============================================================================
--- polly/trunk/lib/Analysis/ScopDetection.cpp (original)
+++ polly/trunk/lib/Analysis/ScopDetection.cpp Mon Dec 21 15:00:43 2015
@@ -71,6 +71,16 @@ using namespace polly;
 
 #define DEBUG_TYPE "polly-detect"
 
+// This option is set to a very high value, as analyzing such loops increases
+// compile time on several cases. For experiments that enable this option,
+// a value of around 40 has been working to avoid run-time regressions with
+// Polly while still exposing interesting optimization opportunities.
+static cl::opt<int> ProfitabilityMinPerLoopInstructions(
+    "polly-detect-profitability-min-per-loop-insts",
+    cl::desc("The minimal number of per-loop instructions before a single loop "
+             "region is considered profitable"),
+    cl::Hidden, cl::ValueRequired, cl::init(100000000), cl::cat(PollyCategory));
+
 bool polly::PollyProcessUnprofitable;
 static cl::opt<bool, true> XPollyProcessUnprofitable(
     "polly-process-unprofitable",
@@ -1134,6 +1144,19 @@ bool ScopDetection::allBlocksValid(Detec
   return true;
 }
 
+bool ScopDetection::hasSufficientCompute(DetectionContext &Context,
+                                         int NumLoops) const {
+  int InstCount = 0;
+
+  for (auto *BB : Context.CurRegion.blocks())
+    if (Context.CurRegion.contains(LI->getLoopFor(BB)))
+      InstCount += std::distance(BB->begin(), BB->end());
+
+  InstCount = InstCount / NumLoops;
+
+  return InstCount >= ProfitabilityMinPerLoopInstructions;
+}
+
 bool ScopDetection::isProfitableRegion(DetectionContext &Context) const {
   Region &CurRegion = Context.CurRegion;
 
@@ -1145,13 +1168,24 @@ bool ScopDetection::isProfitableRegion(D
   if (!Context.hasStores || !Context.hasLoads)
     return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
 
-  // Check if there are sufficent non-overapproximated loops.
   int NumLoops = countBeneficialLoops(&CurRegion);
   int NumAffineLoops = NumLoops - Context.BoxedLoopsSet.size();
-  if (NumAffineLoops < 2)
-    return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
 
-  return true;
+  // Scops with at least two loops may allow either loop fusion or tiling and
+  // are consequently interesting to look at.
+  if (NumAffineLoops >= 2)
+    return true;
+
+  // Scops that contain a loop with a non-trivial amount of computation per
+  // loop-iteration are interesting as we may be able to parallelize such
+  // loops. Individual loops that have only a small amount of computation
+  // per-iteration are performance-wise very fragile as any change to the
+  // loop induction variables may affect performance. To not cause spurious
+  // performance regressions, we do not consider such loops.
+  if (NumAffineLoops == 1 && hasSufficientCompute(Context, NumLoops))
+    return true;
+
+  return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
 }
 
 bool ScopDetection::isValidRegion(DetectionContext &Context) const {

Added: polly/trunk/test/ScopDetect/profitability-large-basic-blocks.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/ScopDetect/profitability-large-basic-blocks.ll?rev=256199&view=auto
==============================================================================
--- polly/trunk/test/ScopDetect/profitability-large-basic-blocks.ll (added)
+++ polly/trunk/test/ScopDetect/profitability-large-basic-blocks.ll Mon Dec 21 15:00:43 2015
@@ -0,0 +1,83 @@
+; RUN: opt %loadPolly -polly-process-unprofitable=false \
+; RUN:                -polly-detect-profitability-min-per-loop-insts=40 \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=PROFITABLE
+
+; RUN: opt %loadPolly -polly-process-unprofitable=true \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=PROFITABLE
+
+; RUN: opt %loadPolly -polly-process-unprofitable=false \
+; RUN: \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=UNPROFITABLE
+
+; UNPROFITABLE-NOT: Valid Region for Scop:
+; PROFITABLE: Valid Region for Scop:
+
+;    void foo(float *A, float *B, long N) {
+;      for (long i = 0; i < 100; i++)
+;          A[i] += .... / * a  lot of compute */
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, float* %B, i64 %N) {
+entry:
+  br label %header
+
+header:
+  %i.0 = phi i64 [ 0, %entry ], [ %tmp10, %header ]
+  %tmp5 = sitofp i64 %i.0 to float
+  %tmp6 = getelementptr inbounds float, float* %A, i64 %i.0
+  %tmp7 = load float, float* %tmp6, align 4
+  %tmp8 = fadd float %tmp7, %tmp5
+  %val0 = fadd float %tmp7, 1.0
+  %val1 = fadd float %val0, 1.0
+  %val2 = fadd float %val1, 1.0
+  %val3 = fadd float %val2, 1.0
+  %val4 = fadd float %val3, 1.0
+  %val5 = fadd float %val4, 1.0
+  %val6 = fadd float %val5, 1.0
+  %val7 = fadd float %val6, 1.0
+  %val8 = fadd float %val7, 1.0
+  %val9 = fadd float %val8, 1.0
+  %val10 = fadd float %val9, 1.0
+  %val11 = fadd float %val10, 1.0
+  %val12 = fadd float %val11, 1.0
+  %val13 = fadd float %val12, 1.0
+  %val14 = fadd float %val13, 1.0
+  %val15 = fadd float %val14, 1.0
+  %val16 = fadd float %val15, 1.0
+  %val17 = fadd float %val16, 1.0
+  %val18 = fadd float %val17, 1.0
+  %val19 = fadd float %val18, 1.0
+  %val20 = fadd float %val19, 1.0
+  %val21 = fadd float %val20, 1.0
+  %val22 = fadd float %val21, 1.0
+  %val23 = fadd float %val22, 1.0
+  %val24 = fadd float %val23, 1.0
+  %val25 = fadd float %val24, 1.0
+  %val26 = fadd float %val25, 1.0
+  %val27 = fadd float %val26, 1.0
+  %val28 = fadd float %val27, 1.0
+  %val29 = fadd float %val28, 1.0
+  %val30 = fadd float %val29, 1.0
+  %val31 = fadd float %val30, 1.0
+  %val32 = fadd float %val31, 1.0
+  %val33 = fadd float %val32, 1.0
+  %val34 = fadd float %val33, 1.0
+  %val35 = fadd float %val34, 1.0
+  %val36 = fadd float %val35, 1.0
+  %val37 = fadd float %val36, 1.0
+  %val38 = fadd float %val37, 1.0
+  %val39 = fadd float %val38, 1.0
+  %val40 = fadd float %val39, 1.0
+  %val41 = fadd float %val40, 1.0
+  %val42 = fadd float %val41, 1.0
+  %val43 = fadd float %val42, 1.0
+  store float %val34, float* %tmp6, align 4
+  %exitcond = icmp ne i64 %i.0, 100
+  %tmp10 = add nsw i64 %i.0, 1
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}