[llvm] [AMDGPU][TTI] Threshold bonus to loops whose unrolling makes nested loops unrollable (PR #114579)

Lucas Ramirez via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 1 10:27:19 PDT 2024


https://github.com/lucas-rami created https://github.com/llvm/llvm-project/pull/114579

AMDGPU TTI's unrolling preferences disallow unrolling loops with a runtime trip count. However, it is common for an inner loop's bounds to depend only on an outer loop's induction variable and on constants, for example when iterating over the upper triangular half of a matrix.

```cpp
for (size_t i = 0; i < N; ++i) {
  for (size_t j = i; j < N; ++j) {
    // matrix[i][j]
  }
}
```

In such cases, unrolling the outer loop can be beneficial in itself, but it can also enable unrolling of the resulting copies of the inner loop, whose trip counts become runtime-independent, yielding significant performance benefits in some cases.
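
For illustration, here is a hand-unrolled sketch of the nest above, assuming N is a compile-time constant equal to 4: once the outer loop is fully unrolled, every copy of the inner loop has constant bounds and becomes a candidate for full unrolling itself.

```cpp
// Outer loop fully unrolled for N == 4 (illustrative sketch): each copy of
// the inner loop now has constant bounds and can be fully unrolled in turn.
for (size_t j = 0; j < 4; ++j) { /* matrix[0][j] */ }
for (size_t j = 1; j < 4; ++j) { /* matrix[1][j] */ }
for (size_t j = 2; j < 4; ++j) { /* matrix[2][j] */ }
for (size_t j = 3; j < 4; ++j) { /* matrix[3][j] */ }
```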

This commit adds a new static bonus to the unrolling threshold of outer loops for each inner loop whose trip count would go from runtime-dependent to runtime-independent if the outer loop were fully unrolled. This increases the likelihood of exposing interesting optimization opportunities in loop nests with the structure outlined above.
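
As a rough sketch of what the heuristic accepts (these snippets mirror the two test functions added below; the parameter ub is illustrative): an inner bound derived from the outer induction variable through binary operators and casts earns the bonus, while a bound that also depends on a value from outside the loop nest, such as a function parameter, does not.

```cpp
// Earns the bonus: both inner-loop bounds derive only from the outer IV and
// constants, so fully unrolling the outer loop makes them compile-time known.
for (size_t i = 0; i < 8; ++i)
  for (size_t j = i; j < 8; ++j) { /* ... */ }

// No bonus: the inner bound 'ub' is a function parameter, so the inner trip
// count stays runtime-dependent even after the outer loop is fully unrolled.
for (size_t i = 0; i < 8; ++i)
  for (size_t j = i; j < ub; ++j) { /* ... */ }
```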

Fixes SWDEV-485683.

>From f56c4321413f3fd567b4044f1cc9521da65e6797 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <Lucas.Ramirez at amd.com>
Date: Fri, 1 Nov 2024 18:03:14 +0100
Subject: [PATCH] Give cond. loop threshold bonus to outer loop in loop nests

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 68 ++++++++++++++-
 .../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 85 +++++++++++++++++++
 2 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5160851f8c4424..79250ad1f83064 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -47,6 +47,13 @@ static cl::opt<unsigned> UnrollThresholdIf(
   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
   cl::init(200), cl::Hidden);
 
+static cl::opt<unsigned> UnrollThresholdNestedStatic(
+    "amdgpu-unroll-threshold-nested-static",
+    cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose "
+             "trip count will be made runtime-independent when fully-unrolling "
+             "the outer loop"),
+    cl::init(200), cl::Hidden);
+
 static cl::opt<bool> UnrollRuntimeLocal(
   "amdgpu-unroll-runtime-local",
   cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
@@ -148,8 +155,67 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
       }
     }
   }
-
   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+
+  if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
+    // Look for subloops whose trip count would go from runtime-dependent to
+    // runtime-independent if we were to unroll the loop. Give a bonus to the
+    // current loop's unrolling threshold for each of these, as fully unrolling
+    // it would likely expose additional optimization opportunities.
+    for (const Loop *SubLoop : L->getSubLoops()) {
+      std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+      if (!Bounds)
+        continue;
+      Value *InitIV = &Bounds->getInitialIVValue();
+      Value *FinalIV = &Bounds->getFinalIVValue();
+      Value *StepVal = Bounds->getStepValue();
+      if (!StepVal)
+        continue;
+
+      // Determines whether SubIV's derivation depends exclusively on constants
+      // and/or IV; if it does, SubIVDependsOnIV is set to true if IV is
+      // involved in the derivation.
+      bool SubIVDependsOnIV = false;
+      std::function<bool(const Value *, unsigned)> FromConstsOrLoopIV =
+          [&](const Value *SubIV, unsigned Depth) -> bool {
+        if (SubIV == IV) {
+          SubIVDependsOnIV = true;
+          return true;
+        }
+        if (isa<Constant>(SubIV))
+          return true;
+        if (Depth >= 10)
+          return false;
+
+        const Instruction *I = dyn_cast<Instruction>(SubIV);
+        // No point in checking outside the loop since IV is necessarily inside
+        // it; also stop searching when encountering an instruction that will
+        // likely not allow SubIV's value to be statically computed.
+        if (!I || !L->contains(I) || !isa<BinaryOperator, CastInst, PHINode>(I))
+          return false;
+
+        // SubIV depends on constants or IV if all of the instruction's
+        // operands involved in its derivation also depend on constants or IV.
+        return llvm::all_of(I->operand_values(), [&](const Value *V) {
+          return FromConstsOrLoopIV(V, Depth + 1);
+        });
+      };
+
+      if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) &&
+          FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) {
+        UP.Threshold += UnrollThresholdNestedStatic;
+        LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+                          << " for loop:\n"
+                          << *L
+                          << " due to subloop's trip count becoming "
+                             "runtime-independent after unrolling:\n  "
+                          << *SubLoop);
+        if (UP.Threshold >= MaxBoost)
+          return;
+      }
+    }
+  }
+
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getDataLayout();
     unsigned LocalGEPsSeen = 0;
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
new file mode 100644
index 00000000000000..36101c50db98ac
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
@@ -0,0 +1,85 @@
+; REQUIRES: asserts
+; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s
+
+; For @dependent_sub_fullunroll, the threshold bonus should apply
+; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
+
+; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
+; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
+
+; Check that the outer loop of a double-nested loop is fully unrolled, thanks
+; to the threshold bonus from AMDGPU's TTI, when the inner loop's trip count
+; depends exclusively on constants and the outer IV.
+
+; CHECK-LABEL: @dependent_sub_fullunroll
+; CHECK: inner.header_latch_exiting.7
+; CHECK: outer.latch_exiting.7
+
+define void @dependent_sub_fullunroll(ptr noundef %mem) {
+entry:
+  br label %outer.header
+
+outer.header:                                                 ; preds = %entry, %outer.latch_exiting
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+  br label %inner.header_latch_exiting
+
+inner.header_latch_exiting:                                   ; preds = %outer.header, %inner.header_latch_exiting
+  %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+  %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+  %outer.iv.ext = zext nneg i32 %outer.iv to i64
+  %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 
+  %inner.iv.ext = zext nneg i32 %inner.iv to i64
+  %idx = add nuw nsw i64 %idx_part, %inner.iv.ext 
+  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+  store i32 0, ptr %addr
+  %inner.cond = icmp ult i32 %inner.iv_next, 8
+  br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting:                                          ; preds = %inner.header_latch_exiting
+  %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+  %outer.cond = icmp ult i32 %outer.iv_next, 8
+  br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+  
+end:                                                          ; preds = %outer.latch_exiting
+  ret void
+}
+
+; Check that the outer loop of the same loop nest as dependent_sub_fullunroll
+; is not fully unrolled when the inner loop's final IV value depends on a
+; function argument instead of a combination of the outer IV and constants.
+
+; CHECK-LABEL: @dependent_sub_no_fullunroll
+; CHECK-NOT: inner.header_latch_exiting.7
+; CHECK-NOT: outer.latch_exiting.7
+
+define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
+entry:
+  br label %outer.header
+
+outer.header:                                                 ; preds = %entry, %outer.latch_exiting
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+  br label %inner.header_latch_exiting
+
+inner.header_latch_exiting:                                   ; preds = %outer.header, %inner.header_latch_exiting
+  %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+  %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+  %outer.iv.ext = zext nneg i32 %outer.iv to i64
+  %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 
+  %inner.iv.ext = zext nneg i32 %inner.iv to i64
+  %idx = add nuw nsw i64 %idx_part, %inner.iv.ext 
+  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+  store i32 0, ptr %addr
+  %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+  br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting:                                          ; preds = %inner.header_latch_exiting
+  %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+  %outer.cond = icmp ult i32 %outer.iv_next, 8
+  br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+  
+end:                                                          ; preds = %outer.latch_exiting
+  ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"amdgpu.loop.unroll.threshold", i32 100}
