[llvm] [LoopUnroll] Ignore inlinable calls if unrolling is forced (#88141) (PR #88149)

George Burgess IV via llvm-commits llvm-commits at lists.llvm.org
Fri May 10 10:42:54 PDT 2024


https://github.com/gburgessiv updated https://github.com/llvm/llvm-project/pull/88149

>From 92f35a17254488ef70221ee56394413a8dabe677 Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv at gmail.com>
Date: Mon, 15 Apr 2024 23:28:29 +0000
Subject: [PATCH 1/2] [LoopUnroll] Plumb LTO prelink information to passes

This will be needed shortly to tweak unrolling logic.
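
Purely for illustration (none of this is part of the patch; the helper
name and parameters below are made up), the new constructor argument
would be wired up roughly like this when hand-building a pipeline:

  #include <utility>

  #include "llvm/IR/PassManager.h"
  #include "llvm/Pass.h"
  #include "llvm/Transforms/Scalar/LoopPassManager.h"
  #include "llvm/Transforms/Scalar/LoopUnrollPass.h"

  using namespace llvm;

  // Sketch: add a full-unroll pass to a FunctionPassManager, telling it
  // whether this pipeline is an LTO pre-link pipeline.
  static void addFullUnroll(FunctionPassManager &FPM,
                            ThinOrFullLTOPhase Phase, int OptLevel) {
    // Mirrors the check PassBuilderPipelines.cpp makes with its
    // file-local isLTOPreLink() helper.
    bool PrepareForLTO = Phase == ThinOrFullLTOPhase::ThinLTOPreLink ||
                         Phase == ThinOrFullLTOPhase::FullLTOPreLink;
    LoopPassManager LPM;
    LPM.addPass(LoopFullUnrollPass(OptLevel,
                                   /*OnlyWhenForced=*/false,
                                   /*ForgetSCEV=*/false,
                                   /*PrepareForLTO=*/PrepareForLTO));
    FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
  }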
---
 .../llvm/Transforms/Scalar/LoopUnrollPass.h   | 15 +++++++++++----
 llvm/lib/Passes/PassBuilderPipelines.cpp      | 17 ++++++++++-------
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 19 +++++++++++++------
 3 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
index 8d8c2f254f02f..c32c6d06d7848 100644
--- a/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -36,11 +36,15 @@ class LoopFullUnrollPass : public PassInfoMixin<LoopFullUnrollPass> {
   /// the internal SCEV records. For large loops, the former is faster.
   const bool ForgetSCEV;
 
+  /// If true, we're in an LTO pre-link stage.
+  const bool PrepareForLTO;
+
 public:
   explicit LoopFullUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false,
-                              bool ForgetSCEV = false)
+                              bool ForgetSCEV = false,
+                              bool PrepareForLTO = false)
       : OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
-        ForgetSCEV(ForgetSCEV) {}
+        ForgetSCEV(ForgetSCEV), PrepareForLTO(PrepareForLTO) {}
 
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
@@ -77,10 +81,13 @@ struct LoopUnrollOptions {
   /// the internal SCEV records. For large loops, the former is faster.
   const bool ForgetSCEV;
 
+  /// If true, we're in an LTO pre-link stage.
+  const bool PrepareForLTO;
+
   LoopUnrollOptions(int OptLevel = 2, bool OnlyWhenForced = false,
-                    bool ForgetSCEV = false)
+                    bool ForgetSCEV = false, bool PrepareForLTO = false)
       : OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
-        ForgetSCEV(ForgetSCEV) {}
+        ForgetSCEV(ForgetSCEV), PrepareForLTO(PrepareForLTO) {}
 
   /// Enables or disables partial unrolling. When disabled only full unrolling
   /// is allowed.
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 3bb2ce0ae3460..84f230abd1694 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -481,9 +481,10 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
   // attention to it.
   if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
       PGOOpt->Action != PGOOptions::SampleUse)
-    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
-                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
-                                    PTO.ForgetAllSCEVInLoopUnroll));
+    LPM2.addPass(LoopFullUnrollPass(
+        Level.getSpeedupLevel(),
+        /* OnlyWhenForced= */ !PTO.LoopUnrolling, PTO.ForgetAllSCEVInLoopUnroll,
+        /* PrepareForLTO= */ isLTOPreLink(Phase)));
 
   invokeLoopOptimizerEndEPCallbacks(LPM2, Level);
 
@@ -672,9 +673,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   // attention to it.
   if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
       PGOOpt->Action != PGOOptions::SampleUse)
-    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
-                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
-                                    PTO.ForgetAllSCEVInLoopUnroll));
+    LPM2.addPass(LoopFullUnrollPass(
+        Level.getSpeedupLevel(),
+        /* OnlyWhenForced= */ !PTO.LoopUnrolling, PTO.ForgetAllSCEVInLoopUnroll,
+        /* PrepareForLTO= */ isLTOPreLink(Phase)));
 
   invokeLoopOptimizerEndEPCallbacks(LPM2, Level);
 
@@ -1951,7 +1953,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   // Unroll small loops and perform peeling.
   LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
                                  /* OnlyWhenForced= */ !PTO.LoopUnrolling,
-                                 PTO.ForgetAllSCEVInLoopUnroll));
+                                 PTO.ForgetAllSCEVInLoopUnroll,
+                                 /* PrepareForLTO= */ false));
   // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
   // *All* loop passes must preserve it, in order to be able to use it.
   MainFPM.addPass(createFunctionToLoopPassAdaptor(
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 75fb8765061ed..b1e06bf1e2931 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1140,7 +1140,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
                 std::optional<bool> ProvidedUpperBound,
                 std::optional<bool> ProvidedAllowPeeling,
                 std::optional<bool> ProvidedAllowProfileBasedPeeling,
-                std::optional<unsigned> ProvidedFullUnrollMaxCount) {
+                std::optional<unsigned> ProvidedFullUnrollMaxCount,
+                bool PrepareForLTO) {
 
   LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                     << L->getHeader()->getParent()->getName() << "] Loop %"
@@ -1384,6 +1385,7 @@ class LoopUnroll : public LoopPass {
   std::optional<bool> ProvidedAllowPeeling;
   std::optional<bool> ProvidedAllowProfileBasedPeeling;
   std::optional<unsigned> ProvidedFullUnrollMaxCount;
+  bool PrepareForLTO;
 
   LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
              bool ForgetAllSCEV = false,
@@ -1394,14 +1396,16 @@ class LoopUnroll : public LoopPass {
              std::optional<bool> UpperBound = std::nullopt,
              std::optional<bool> AllowPeeling = std::nullopt,
              std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
-             std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
+             std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt,
+             bool PrepareForLTO = false)
       : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
         ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
         ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
         ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
         ProvidedAllowPeeling(AllowPeeling),
         ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
-        ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
+        ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount),
+        PrepareForLTO(PrepareForLTO) {
     initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
   }
 
@@ -1428,7 +1432,8 @@ class LoopUnroll : public LoopPass {
         /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,
         ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
         ProvidedUpperBound, ProvidedAllowPeeling,
-        ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount);
+        ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount,
+        PrepareForLTO);
 
     if (Result == LoopUnrollResult::FullyUnrolled)
       LPM.markLoopAsDeleted(*L);
@@ -1502,7 +1507,8 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                       /*Runtime*/ false, /*UpperBound*/ false,
                       /*AllowPeeling*/ true,
                       /*AllowProfileBasedPeeling*/ false,
-                      /*FullUnrollMaxCount*/ std::nullopt) !=
+                      /*FullUnrollMaxCount*/ std::nullopt,
+                      /*PrepareForLTO*/ PrepareForLTO) !=
       LoopUnrollResult::Unmodified;
   if (!Changed)
     return PreservedAnalyses::all();
@@ -1627,7 +1633,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
         /*Count*/ std::nullopt,
         /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
         UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
-        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
+        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount,
+        UnrollOpts.PrepareForLTO);
     Changed |= Result != LoopUnrollResult::Unmodified;
 
     // The parent must not be damaged by unrolling!

>From 789592bee2664ad9b770976889f8b7c82c881c04 Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv at gmail.com>
Date: Mon, 15 Apr 2024 23:28:35 +0000
Subject: [PATCH 2/2] [LoopUnroll] Ignore inlinable calls if unrolling is
 forced

`NumInlineCandidates` counts call sites that are _very likely_ to be
inlined. This is a useful metric, but skipping unrolling because of it
causes linker warnings if:
- the loop to be unrolled has had unrolling forced by the user, and
- the inliner later fails to inline the call (e.g., because it's
  considered a very cold callsite).
Fixes #88141
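
For context, the kind of input that hits this is roughly the following
(a hypothetical C++ reduction, not taken from the linked issue; `helper`
and `update` are placeholder names, and whether it reproduces exactly
depends on pass ordering and the inliner's cost model):

  // Hypothetical reduction: unrolling is forced via the pragma, but the
  // single call to the file-local helper() is counted as an inline
  // candidate by LoopUnroll's size analysis, so unrolling is refused.
  // If the inliner later declines to inline helper() anyway (e.g. the
  // call site is very cold per profile data), the requested unroll
  // never happens and a "loop not unrolled" warning (-Wpass-failed)
  // can be emitted.
  static void helper(int *slot) { *slot += 1; }

  void update(int *a) {
  #pragma clang loop unroll_count(2)
    for (int i = 0; i < 64; ++i) {
      helper(&a[i]);
      a[i] += 1;
    }
  }

Under (Thin)LTO the full-unroll pass also runs in the link-time
pipeline, which is presumably where the warnings mentioned above
surface.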
---
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 12 +++++--
 .../LoopUnroll/unroll-calls-if-forced.ll      | 35 +++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/unroll-calls-if-forced.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index b1e06bf1e2931..18007c2bdd422 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1155,9 +1155,11 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   // automatic unrolling from interfering with the user requested
   // transformation.
   Loop *ParentL = L->getParentLoop();
+  const bool UnrollIsForcedByUser =
+      hasUnrollTransformation(L) == TM_ForcedByUser;
   if (ParentL != nullptr &&
       hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&
-      hasUnrollTransformation(L) != TM_ForcedByUser) {
+      !UnrollIsForcedByUser) {
     LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
                       << " llvm.loop.unroll_and_jam.\n");
     return LoopUnrollResult::Unmodified;
@@ -1167,7 +1169,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
   // unrolling from interfering with the user requested transformation.
   if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&
-      hasUnrollTransformation(L) != TM_ForcedByUser) {
+      !UnrollIsForcedByUser) {
     LLVM_DEBUG(
         dbgs()
         << "  Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
@@ -1217,7 +1219,11 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   if (OptForSize)
     UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
 
-  if (UCE.NumInlineCandidates != 0) {
+  // Ignore the potential for inlining if unrolling is forced by the user. It's
+  // only _very likely_ that `NumInlineCandidates` functions will be inlined;
+  // things like profile data can make this significantly less likely.
+  if ((PrepareForLTO || !UnrollIsForcedByUser) &&
+      UCE.NumInlineCandidates != 0) {
     LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
     return LoopUnrollResult::Unmodified;
   }
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-calls-if-forced.ll b/llvm/test/Transforms/LoopUnroll/unroll-calls-if-forced.ll
new file mode 100644
index 0000000000000..00b774ea898d5
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/unroll-calls-if-forced.ll
@@ -0,0 +1,35 @@
+; RUN: opt -passes=loop-unroll -S < %s | FileCheck %s
+;
+; Ensure that loop unrolling occurs if the user forces it, even if there are
+; calls that may be inlined.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define internal void @foo() {
+  ret void
+}
+
+; CHECK-LABEL: @forced(
+; CHECK: call {{.*}}void @foo
+; CHECK: call {{.*}}void @foo
+define void @forced(ptr nocapture %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  call void @foo()
+  %0 = load i32, ptr %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.unroll.enable"},
+                    !{!"llvm.loop.unroll.count", i32 2}}


