[llvm] [LoopUnroll] Ignore inlinable calls if unrolling is forced (PR #88149)

Mon Nov 18 15:40:17 PST 2024

https://github.com/gburgessiv updated https://github.com/llvm/llvm-project/pull/88149

>From fdc96e4799018daaec1bbdc2d2ac214df0b7dff7 Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv at gmail.com>
Date: Mon, 15 Apr 2024 23:28:35 +0000
Subject: [PATCH] [LoopUnroll] Ignore inlinable calls if unrolling is forced

`NumInlineCandidates` counts candidates that are _very likely_ to be
inlined. This is a useful metric, but causes linker warnings if:
- the loop to be unrolled has had unrolling forced by the user, and
- the inliner fails to inline the call (e.g., because it's considered a
  very cold callsite)

Fixes #88141
---
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 15 ++++-
 .../LoopUnroll/unroll-calls-if-forced.ll      | 63 +++++++++++++++++++
 2 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/unroll-calls-if-forced.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 953748f07af028..24f44b007d5199 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1179,9 +1179,11 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   // automatic unrolling from interfering with the user requested
   // transformation.
   Loop *ParentL = L->getParentLoop();
+  const bool UnrollIsForcedByUser =
+      hasUnrollTransformation(L) == TM_ForcedByUser;
   if (ParentL != nullptr &&
       hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&
-      hasUnrollTransformation(L) != TM_ForcedByUser) {
+      !UnrollIsForcedByUser) {
     LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
                       << " llvm.loop.unroll_and_jam.\n");
     return LoopUnrollResult::Unmodified;
@@ -1191,7 +1193,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
   // unrolling from interfering with the user requested transformation.
   if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&
-      hasUnrollTransformation(L) != TM_ForcedByUser) {
+      !UnrollIsForcedByUser) {
     LLVM_DEBUG(
         dbgs()
         << "  Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
@@ -1240,7 +1242,14 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   if (OptForSize)
     UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
 
-  if (UCE.NumInlineCandidates != 0) {
+  // Ignore the potential for inlining if unrolling is forced by the user. It's
+  // only _very likely_ that `NumInlineCandidates` functions will be inlined;
+  // things like profile data can make this significantly less likely.
+  //
+  // TODO: It could be beneficial to allow this check in pre-link LTO, since
+  // that pipeline prefers to minimize IR size, but that requires a decent
+  // amount of extra plumbing.
+  if (!UnrollIsForcedByUser && UCE.NumInlineCandidates != 0) {
     LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
     return LoopUnrollResult::Unmodified;
   }
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-calls-if-forced.ll b/llvm/test/Transforms/LoopUnroll/unroll-calls-if-forced.ll
new file mode 100644
index 00000000000000..762c0949f3e842
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/unroll-calls-if-forced.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-unroll -S < %s | FileCheck %s
+;
+; Ensure that loop unrolling occurs if the user forces it, even if there are
+; calls that may be inlined.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define internal void @foo() {
+; CHECK-LABEL: define internal void @foo() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @forced(ptr nocapture %a) {
+; CHECK-LABEL: define void @forced(
+; CHECK-SAME: ptr nocapture [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT_1:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
+; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[INC_1:%.*]] = add nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    store i32 [[INC_1]], ptr [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 64
+; CHECK-NEXT:    br i1 [[EXITCOND_1]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  call void @foo()
+  %0 = load i32, ptr %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.unroll.enable"},
+  !{!"llvm.loop.unroll.count", i32 2}}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.