[llvm] [AMDGPU][TTI] Threshold bonus to loops whose unrolling makes nested loops unrollable (PR #114579)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 19 06:19:06 PST 2024
https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/114579
>From f56c4321413f3fd567b4044f1cc9521da65e6797 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <Lucas.Ramirez at amd.com>
Date: Fri, 1 Nov 2024 18:03:14 +0100
Subject: [PATCH 1/3] Give cond. loop threshold bonus to outer loop in loop
nests
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 68 ++++++++++++++-
.../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 84 +++++++++++++++++++
2 files changed, 151 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5160851f8c4424..79250ad1f83064 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -47,6 +47,13 @@ static cl::opt<unsigned> UnrollThresholdIf(
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(200), cl::Hidden);
+static cl::opt<unsigned> UnrollThresholdNestedStatic(
+ "amdgpu-unroll-threshold-nested-static",
+ cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose "
+ "trip count will be made runtime-independent when fully-unrolling "
+ "the outer loop"),
+ cl::init(200), cl::Hidden);
+
static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
@@ -148,8 +155,67 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
}
-
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+
+ if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
+ // Look for subloops whose trip count would go from runtime-dependent to
+ // runtime-independent if we were to unroll the loop. Give a bonus to the
+ // current loop's unrolling threshold for each of these, as fully unrolling
+ // it would likely expose additional optimization opportunities.
+ for (const Loop *SubLoop : L->getSubLoops()) {
+ std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+ if (!Bounds)
+ continue;
+ Value *InitIV = &Bounds->getInitialIVValue();
+ Value *FinalIV = &Bounds->getFinalIVValue();
+ Value *StepVal = Bounds->getStepValue();
+ if (!StepVal)
+ continue;
+
+ // Determines whether SubIV's derivation depends exclusively on constants
+ // and/or IV; if it does, SubIVDependsOnIV is set to true if IV is
+ // involved in the derivation.
+ bool SubIVDependsOnIV = false;
+ std::function<bool(const Value *, unsigned)> FromConstsOrLoopIV =
+ [&](const Value *SubIV, unsigned Depth) -> bool {
+ if (SubIV == IV) {
+ SubIVDependsOnIV = true;
+ return true;
+ }
+ if (isa<Constant>(SubIV))
+ return true;
+ if (Depth >= 10)
+ return false;
+
+ const Instruction *I = dyn_cast<Instruction>(SubIV);
+ // No point in checking outside the loop since IV is necessarily inside
+ // it; also stop searching when encountering an instruction that will
+ // likely not allow SubIV's value to be statically computed.
+ if (!I || !L->contains(I) || !isa<BinaryOperator, CastInst, PHINode>(I))
+ return false;
+
+ // SubIV depends on constants or IV if all of the instruction's
+ // operands involved in its derivation also depend on constants or IV.
+ return llvm::all_of(I->operand_values(), [&](const Value *V) {
+ return FromConstsOrLoopIV(V, Depth + 1);
+ });
+ };
+
+ if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) &&
+ FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) {
+ UP.Threshold += UnrollThresholdNestedStatic;
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n"
+ << *L
+ << " due to subloop's trip count becoming "
+ "runtime-independent after unrolling:\n "
+ << *SubLoop);
+ if (UP.Threshold >= MaxBoost)
+ return;
+ }
+ }
+ }
+
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getDataLayout();
unsigned LocalGEPsSeen = 0;
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
new file mode 100644
index 00000000000000..36101c50db98ac
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
@@ -0,0 +1,84 @@
+; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s
+
+; For @dependent_sub_fullunroll, the threshold bonus should apply
+; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
+
+; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
+; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
+
+; Check that the outer loop of a double-nested loop where the inner loop's trip
+; count depends exclusively on constants and the outer IV is fully unrolled
+; thanks to receiving a threshold bonus in AMDGPU's TTI.
+
+; CHECK-LABEL: @dependent_sub_fullunroll
+; CHECK: inner.header_latch_exiting.7
+; CHECK: outer.latch_exiting.7
+
+define void @dependent_sub_fullunroll(ptr noundef %mem) {
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, 8
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 8
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+; Check that the outer loop of the same loop nest as dependent_sub_fullunroll
+; is not fully unrolled when the inner loop's final IV value depends on a
+; function argument instead of a combination of the outer IV and constants.
+
+; CHECK-LABEL: @dependent_sub_no_fullunroll
+; CHECK-NOT: outer.latch_exiting.7
+; CHECK-NOT: outer.latch_exiting.7
+
+define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 8
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"amdgpu.loop.unroll.threshold", i32 100}
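As a rough illustration of the pattern this first patch rewards (a hypothetical C++ reconstruction of @dependent_sub_fullunroll above, not code from the patch): the inner loop's bounds are derived only from constants and the outer induction variable, so fully unrolling the outer loop leaves each inner-loop copy with a compile-time trip count.

  // Sketch only: the IR above stores an i32 at byte offset i*16 + j; the
  // exact addressing and store width are elided here.
  void dependent_sub_fullunroll(char *mem) {
    for (unsigned i = 0; i < 8; ++i)    // outer loop: constant trip count
      for (unsigned j = i; j < 8; ++j)  // inner trip count (8 - i) depends on i
        mem[i * 16 + j] = 0;
  }
  // After the outer loop is fully unrolled, the eight inner-loop copies have
  // constant trip counts 8, 7, ..., 1 and become trivially unrollable, which
  // is what the amdgpu-unroll-threshold-nested-static bonus anticipates.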
>From 0a72dca4142ab896b39bd89f3c5fdab4e1ed6bd8 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <Lucas.Ramirez at amd.com>
Date: Mon, 4 Nov 2024 11:53:43 +0100
Subject: [PATCH 2/3] Address reviewers' comments
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +-
.../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 183 ++++++++++++++++--
2 files changed, 167 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 79250ad1f83064..8d6eb94af4a108 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -157,7 +157,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
- if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
+ if (PHINode *IV = L->getInductionVariable(SE)) {
// Look for subloops whose trip count would go from runtime-dependent to
// runtime-independent if we were to unroll the loop. Give a bonus to the
// current loop's unrolling threshold for each of these, as fully unrolling
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
index 36101c50db98ac..97de4cbf0936c6 100644
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug < %s 2>&1 | FileCheck %s
; For @dependent_sub_fullunroll, the threshold bonus should apply
; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
@@ -6,15 +8,63 @@
; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
+; For @dont_unroll_illegal_convergent_op, the threshold bonus should apply even if there is no unrolling
+; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
+
; Check that the outer loop of a double-nested loop where the inner loop's trip
; count depends exclusively on constants and the outer IV is fully unrolled
; thanks to receiving a threshold bonus in AMDGPU's TTI.
-; CHECK-LABEL: @dependent_sub_fullunroll
-; CHECK: inner.header_latch_exiting.7
-; CHECK: outer.latch_exiting.7
-
define void @dependent_sub_fullunroll(ptr noundef %mem) {
+; CHECK-LABEL: @dependent_sub_fullunroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_2:%.*]]
+; CHECK: inner.header_latch_exiting.2:
+; CHECK-NEXT: [[INNER_IV_2:%.*]] = phi i32 [ 2, [[OUTER_LATCH_EXITING_1]] ], [ [[INNER_IV_NEXT_2:%.*]], [[INNER_HEADER_LATCH_EXITING_2]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_2]] = add nuw nsw i32 [[INNER_IV_2]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_2:%.*]] = zext nneg i32 [[INNER_IV_2]] to i64
+; CHECK-NEXT: [[IDX_2:%.*]] = add nuw nsw i64 32, [[INNER_IV_EXT_2]]
+; CHECK-NEXT: [[ADDR_2:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_2]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_2]], align 4
+; CHECK-NEXT: [[INNER_COND_2:%.*]] = icmp ult i32 [[INNER_IV_NEXT_2]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_2]], label [[INNER_HEADER_LATCH_EXITING_2]], label [[OUTER_LATCH_EXITING_2:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.2:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_3:%.*]]
+; CHECK: inner.header_latch_exiting.3:
+; CHECK-NEXT: [[INNER_IV_3:%.*]] = phi i32 [ 3, [[OUTER_LATCH_EXITING_2]] ], [ [[INNER_IV_NEXT_3:%.*]], [[INNER_HEADER_LATCH_EXITING_3]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_3]] = add nuw nsw i32 [[INNER_IV_3]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_3:%.*]] = zext nneg i32 [[INNER_IV_3]] to i64
+; CHECK-NEXT: [[IDX_3:%.*]] = add nuw nsw i64 48, [[INNER_IV_EXT_3]]
+; CHECK-NEXT: [[ADDR_3:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_3]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_3]], align 4
+; CHECK-NEXT: [[INNER_COND_3:%.*]] = icmp ult i32 [[INNER_IV_NEXT_3]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_3]], label [[INNER_HEADER_LATCH_EXITING_3]], label [[OUTER_LATCH_EXITING_3:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.3:
+; CHECK-NEXT: ret void
+;
entry:
br label %outer.header
@@ -26,9 +76,9 @@ inner.header_latch_exiting: ; preds = %outer.h
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds i8, ptr %mem, i64 %idx
store i32 0, ptr %addr
%inner.cond = icmp ult i32 %inner.iv_next, 8
@@ -36,9 +86,9 @@ inner.header_latch_exiting: ; preds = %outer.h
outer.latch_exiting: ; preds = %inner.header_latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 8
+ %outer.cond = icmp ult i32 %outer.iv_next, 4
br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
+
end: ; preds = %outer.latch_exiting
ret void
}
@@ -47,11 +97,45 @@ end: ; preds = %outer.l
; is not fully unrolled when the inner loop's final IV value depends on a
; function argument instead of a combination of the outer IV and constants.
-; CHECK-LABEL: @dependent_sub_no_fullunroll
-; CHECK-NOT: outer.latch_exiting.7
-; CHECK-NOT: outer.latch_exiting.7
-
define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
+; CHECK-LABEL: @dependent_sub_no_fullunroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[OUTER_LATCH_EXITING_1:%.*]] ]
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: [[OUTER_IV_NEXT:%.*]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ [[OUTER_IV_NEXT]], [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV_NEXT]] to i64
+; CHECK-NEXT: [[IDX_PART_1:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT_1]], 16
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 [[IDX_PART_1]], [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], [[INNER_UB]]
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 2
+; CHECK-NEXT: [[OUTER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 4
+; CHECK-NEXT: br i1 [[OUTER_COND_1]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
entry:
br label %outer.header
@@ -63,9 +147,9 @@ inner.header_latch_exiting: ; preds = %outer.h
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds i8, ptr %mem, i64 %idx
store i32 0, ptr %addr
%inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
@@ -73,9 +157,74 @@ inner.header_latch_exiting: ; preds = %outer.h
outer.latch_exiting: ; preds = %inner.header_latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 8
+ %outer.cond = icmp ult i32 %outer.iv_next, 4
br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+; Make sure that the threshold bonus does not override a correctness check and
+; unrolling when a convergent operation that is illegal to unroll is present.
+; The loop nest is the same as before except for the fact that the outer
+; loop's upper bound is now 11 (instead of 4) and there is an uncontrolled
+; convergent call in the outer loop's header. Were the call non-convergent,
+; the outer loop would be partially unrolled by a factor of 2, with a breakout
+; of 1.
+
+declare void @convergent_operation() convergent
+
+define void @dont_unroll_illegal_convergent_op(ptr noundef %mem) {
+; CHECK-LABEL: @dont_unroll_illegal_convergent_op(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ]
+; CHECK-NEXT: call void @convergent_operation()
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 11
+; CHECK-NEXT: br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ call void @convergent_operation()
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, 8
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 11
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
end: ; preds = %outer.latch_exiting
ret void
}
>From 30a482719ea3156263e66f1aefe62afa8489abd7 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 19 Dec 2024 15:18:11 +0100
Subject: [PATCH 3/3] Moved logic to target-independent analysis and improved
it
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 68 +---
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 303 +++++++++++++--
.../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 233 ------------
...mplete_unroll_profitability_with_assume.ll | 46 ++-
.../LoopUnroll/full-unroll-cost-savings.ll | 354 ++++++++++++++++++
5 files changed, 665 insertions(+), 339 deletions(-)
delete mode 100644 llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
create mode 100644 llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 8d6eb94af4a108..5160851f8c4424 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -47,13 +47,6 @@ static cl::opt<unsigned> UnrollThresholdIf(
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(200), cl::Hidden);
-static cl::opt<unsigned> UnrollThresholdNestedStatic(
- "amdgpu-unroll-threshold-nested-static",
- cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose "
- "trip count will be made runtime-independent when fully-unrolling "
- "the outer loop"),
- cl::init(200), cl::Hidden);
-
static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
@@ -155,67 +148,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
}
- unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
-
- if (PHINode *IV = L->getInductionVariable(SE)) {
- // Look for subloops whose trip count would go from runtime-dependent to
- // runtime-independent if we were to unroll the loop. Give a bonus to the
- // current loop's unrolling threshold for each of these, as fully unrolling
- // it would likely expose additional optimization opportunities.
- for (const Loop *SubLoop : L->getSubLoops()) {
- std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
- if (!Bounds)
- continue;
- Value *InitIV = &Bounds->getInitialIVValue();
- Value *FinalIV = &Bounds->getFinalIVValue();
- Value *StepVal = Bounds->getStepValue();
- if (!StepVal)
- continue;
-
- // Determines whether SubIV's derivation depends exclusively on constants
- // and/or IV; if it does, SubIVDependsOnIV is set to true if IV is
- // involved in the derivation.
- bool SubIVDependsOnIV = false;
- std::function<bool(const Value *, unsigned)> FromConstsOrLoopIV =
- [&](const Value *SubIV, unsigned Depth) -> bool {
- if (SubIV == IV) {
- SubIVDependsOnIV = true;
- return true;
- }
- if (isa<Constant>(SubIV))
- return true;
- if (Depth >= 10)
- return false;
-
- const Instruction *I = dyn_cast<Instruction>(SubIV);
- // No point in checking outside the loop since IV is necessarily inside
- // it; also stop searching when encountering an instruction that will
- // likely not allow SubIV's value to be statically computed.
- if (!I || !L->contains(I) || !isa<BinaryOperator, CastInst, PHINode>(I))
- return false;
-
- // SubIV depends on constants or IV if all of the instruction's
- // operands involved in its derivation also depend on constants or IV.
- return llvm::all_of(I->operand_values(), [&](const Value *V) {
- return FromConstsOrLoopIV(V, Depth + 1);
- });
- };
-
- if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) &&
- FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) {
- UP.Threshold += UnrollThresholdNestedStatic;
- LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
- << " for loop:\n"
- << *L
- << " due to subloop's trip count becoming "
- "runtime-independent after unrolling:\n "
- << *SubLoop);
- if (UP.Threshold >= MaxBoost)
- return;
- }
- }
- }
+ unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getDataLayout();
unsigned LocalGEPsSeen = 0;
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index cbc35b6dd4292a..a4bcc2d9e7efa6 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -85,9 +85,9 @@ static cl::opt<unsigned>
static cl::opt<unsigned>
UnrollOptSizeThreshold(
- "unroll-optsize-threshold", cl::init(0), cl::Hidden,
- cl::desc("The cost threshold for loop unrolling when optimizing for "
- "size"));
+ "unroll-optsize-threshold", cl::init(0), cl::Hidden,
+ cl::desc("The cost threshold for loop unrolling when optimizing for "
+ "size"));
static cl::opt<unsigned> UnrollPartialThreshold(
"unroll-partial-threshold", cl::Hidden,
@@ -154,7 +154,7 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
static cl::opt<bool> UnrollUnrollRemainder(
"unroll-remainder", cl::Hidden,
- cl::desc("Allow the loop remainder to be unrolled."));
+ cl::desc("Allow the loop remainder to be unrolled."));
// This option isn't ever intended to be enabled, it serves to allow
// experiments to check the assumptions about when this kind of revisit is
@@ -337,8 +337,239 @@ struct PragmaInfo {
const bool PragmaEnableUnroll;
};
+/// Helper type to estimate per-iteration cost savings coming from fully
+/// unrolling a loop.
+///
+/// The analysis maintains a set of "known instructions" inside the loop (i.e.,
+/// instructions whose result will be statically known after loop unrolling)
+/// that we assume will be entirely removable if the loop is fully unrolled.
+/// These instructions' cost can be deducted from the unrolled cost when
+/// comparing against a threshold.
+struct FullUnrollCostSavings {
+ FullUnrollCostSavings(const Loop *L) : L(L) {}
+
+ /// Returns whether the instruction is known.
+ inline bool isKnown(const Instruction *I) const {
+ return KnownVals.contains(I);
+ }
+
+ /// If the value is an instruction, returns whether that instruction is known,
+ /// false otherwise.
+ bool isKnown(const Value *V) const {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return isKnown(I);
+ return false;
+ }
+
+ /// Adds an instruction to the known set and re-evaluates unknown instructions
+ /// in the loop to determine whether their result can now be known.
+ void addToKnown(const Instruction *I) {
+ if (!KnownVals.insert(I).second)
+ return;
+
+ // Every time we assume knowledge of an additional instruction result, we
+ // potentially need to revisit instructions that were previously seen as
+ // unoptimizable.
+ Evaluated.clear();
+
+ addUsersToExploreSet(I);
+ while (ToEvaluate.size()) {
+ const Instruction *I = ToEvaluate.back();
+ ToEvaluate.pop_back();
+ evalInstruction(I);
+ }
+ }
+
+ /// Returns savings incurred by all known instructions, according to the \p
+ /// TTI.
+ InstructionCost computeSavings(const TargetTransformInfo &TTI) const {
+ TargetTransformInfo::TargetCostKind CostKind =
+ L->getHeader()->getParent()->hasMinSize()
+ ? TargetTransformInfo::TCK_CodeSize
+ : TargetTransformInfo::TCK_SizeAndLatency;
+
+ InstructionCost CostSavings;
+ for (const Value *Val : KnownVals)
+ CostSavings += TTI.getInstructionCost(cast<Instruction>(Val), CostKind);
+ return CostSavings;
+ }
+
+private:
+  /// The set of instructions inside the loop whose results are considered known.
+ SmallPtrSet<const Instruction *, 4> KnownVals;
+ /// Caches the set of instructions we have already evaluated when adding a new
+ /// instruction to the known set.
+ SmallPtrSet<const Instruction *, 4> Evaluated;
+ /// Stack of instructions to evaluate when adding a new instruction to the
+ /// known set.
+ SmallVector<const Instruction *, 4> ToEvaluate;
+ /// The loop under consideration.
+ const Loop *L;
+
+ /// Adds all value users to the stack of instructions to evaluate, if they
+ /// have not been evaluated already.
+ void addUsersToExploreSet(const Value *Val) {
+ for (const User *U : Val->users()) {
+ if (const Instruction *I = dyn_cast<Instruction>(U))
+ if (!Evaluated.contains(I))
+ ToEvaluate.push_back(I);
+ }
+ }
+
+ /// Evaluates an instruction to determine whether its result is "known", and
+  /// returns true if that is the case. This may recurse on operands that are
+  /// the result of yet-unevaluated instructions inside the loop.
+ bool evalInstruction(const Instruction *I) {
+ Evaluated.insert(I);
+ if (isKnown(I))
+ return true;
+ if (!isa<BinaryOperator, CastInst, CmpInst>(I))
+ return false;
+ bool Known = llvm::all_of(I->operand_values(), [&](const Value *Val) {
+ if (isa<Constant>(Val) || isKnown(Val))
+ return true;
+ const Instruction *ValInstr = dyn_cast<Instruction>(Val);
+ if (!ValInstr || Evaluated.contains(ValInstr) || !L->contains(ValInstr))
+ return false;
+ return evalInstruction(ValInstr);
+ });
+ if (Known) {
+ KnownVals.insert(I);
+ addUsersToExploreSet(I);
+ }
+ return Known;
+ }
+};
+
} // end anonymous namespace
+/// Runs a fast analysis on the loop to determine whether it is worth it to
+/// fully unroll it. As opposed to analyzeLoopUnrollCost, this does not attempt
+/// to simulate execution of every loop iteration but instead tries to identify
+/// the set of instructions that can be optimized away if the loop is fully
+/// unrolled. Returns estimated instruction cost savings per loop iteration if
+/// the loop were to be fully unrolled according to the trip count in UP.Count.
+static InstructionCost analyzeFullUnrollCostSavings(
+ const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ const TargetTransformInfo::UnrollingPreferences &UP) {
+ // Cost savings analysis is all based on unrolling making some values
+ // statically known; if we cannot identify the loop's IV then there is nothing
+ // we can do.
+ PHINode *IV = L->getInductionVariable(SE);
+ if (!IV)
+ return {};
+ FullUnrollCostSavings Savings(L);
+
+ // If we were to unroll the loop, everything that is only dependent on the IV
+ // and constants will get simplified away.
+ Savings.addToKnown(IV);
+
+ // Look for subloops whose trip count would go from runtime-dependent to
+ // runtime-independent if we were to unroll the loop. These subloops are
+ // likely to be fully unrollable in the future and yield further cost savings.
+ unsigned NumUnrollableSubloops = 0;
+ for (const Loop *SubLoop : L->getSubLoops()) {
+ // We must be able to determine the loop's IV, initial/final IV value, and
+ // step.
+ PHINode *SubIV = SubLoop->getInductionVariable(SE);
+ if (!SubIV)
+ continue;
+ std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+ if (!Bounds)
+ continue;
+ Value *StepVal = Bounds->getStepValue();
+ if (!StepVal)
+ continue;
+
+ bool SubBoundsDependsOnIV = false;
+ auto IsValKnown = [&](const Value *Val) -> bool {
+ if (isa<Constant>(Val))
+ return true;
+ if (Savings.isKnown(Val)) {
+ SubBoundsDependsOnIV = true;
+ return true;
+ }
+ return false;
+ };
+
+ // Determine whether the derivation of the subloop's bounds depends
+ // exclusively on constants and the outer loop's IV.
+ if (IsValKnown(&Bounds->getInitialIVValue()) &&
+ IsValKnown(&Bounds->getFinalIVValue()) && IsValKnown(StepVal) &&
+ SubBoundsDependsOnIV) {
+ // Optimistically assume that we will be able to unroll the subloop in the
+ // future, which means that its IV will also be known on all inner loop
+ // iterations, leading to more instructions being optimized away. Properly
+ // estimating the cost savings per outer loop iteration would require us
+ // to estimate the average subloop trip count, but it is too complicated
+ // for this analysis. When determining cost savings, we will very
+ // conservatively assume that the inner loop will only execute once per
+ // outer loop iteration. This also reduces our cost savings estimation
+      // error in the case where the subloop does not end up being unrolled.
+ Savings.addToKnown(SubIV);
+ ++NumUnrollableSubloops;
+
+ LLVM_DEBUG(
+ dbgs() << " Trip count of subloop %"
+ << SubLoop->getHeader()->getName()
+ << " will become runtime-independent by fully unrolling loop %"
+ << L->getHeader()->getName() << "\n");
+ }
+ }
+
+  // Look for conditional branches whose condition would be statically
+  // determined at each iteration of the loop if it were unrolled. In some
+  // cases, this means we will be able to remove the branch entirely.
+ for (const BasicBlock *BB : L->getBlocks()) {
+ const Instruction *TermInstr = BB->getTerminator();
+ if (const BranchInst *Br = dyn_cast<BranchInst>(TermInstr)) {
+ if (Br->isConditional() && Savings.isKnown(Br->getCondition())) {
+ // The branch condition will be statically determined at each iteration
+ // of the loop.
+        BasicBlock *TrueSucc = Br->getSuccessor(0),
+                   *FalseSucc = Br->getSuccessor(1);
+
+        // Checks whether one of the branch's successors has at most two
+ // predecessors which are either the branch's block or the other branch
+ // successor.
+ auto IsIfThen = [&](auto Predecessors, BasicBlock *OtherSucc) -> bool {
+ unsigned NumPreds = 0;
+ for (const BasicBlock *Pred : Predecessors) {
+ if (Pred != BB && Pred != OtherSucc)
+ return false;
+ if (++NumPreds > 2)
+ return false;
+ }
+ return true;
+ };
+
+ if ((TrueSucc->getSinglePredecessor() ||
+ IsIfThen(predecessors(TrueSucc), FalseSucc)) &&
+ (FalseSucc->getSinglePredecessor() ||
+ IsIfThen(predecessors(FalseSucc), TrueSucc))) {
+ // The CFG corresponds to a simple if/then(/else) construct whose
+          // condition we will know, so we will be able to remove the branch and
+ // one of the two blocks at each iteration of the outer loop. Only the
+ // branch represents a cost saving, since one successor block will
+ // still be executed.
+ Savings.addToKnown(Br);
+ LLVM_DEBUG(dbgs() << " Conditional branch will be removed by fully "
+ "unrolling loop %"
+ << L->getHeader()->getName() << "\n");
+ }
+ }
+ }
+ }
+
+ // Compute cost savings from instructions that will likely be optimized away
+ // by unrolling the loop.
+ InstructionCost CostSavings = Savings.computeSavings(TTI);
+ // Finally, for each subloop that we think will become unrollable, account for
+ // the backedge's branch being removed.
+ CostSavings += NumUnrollableSubloops;
+ return CostSavings;
+}
+
/// Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
@@ -833,34 +1064,54 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
return std::nullopt;
}
-static std::optional<unsigned> shouldFullUnroll(
- Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
- ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
- const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
- const TargetTransformInfo::UnrollingPreferences &UP) {
- assert(FullUnrollTripCount && "should be non-zero!");
+static bool
+shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
+ ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ const UnrollCostEstimator UCE,
+ const TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(UP.Count && "should be non-zero!");
- if (FullUnrollTripCount > UP.FullUnrollMaxCount)
- return std::nullopt;
+ if (UP.Count > UP.FullUnrollMaxCount)
+ return false;
// When computing the unrolled size, note that BEInsns are not replicated
// like the rest of the loop body.
if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
- return FullUnrollTripCount;
+ return true;
// The loop isn't that small, but we still can fully unroll it if that
- // helps to remove a significant number of instructions.
- // To check that, run additional analysis on the loop.
+ // helps to remove a significant number of instructions. To check that, run
+ // additional analyses on the loop. First try a full iteration-by-iteration
+ // analysis on the loop. If that fails, run a simpler structural analysis that
+ // estimates per-iteration cost savings in the unrolled loop.
if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, SE, EphValues, TTI,
+ L, UP.Count, DT, SE, EphValues, TTI,
UP.Threshold * UP.MaxPercentThresholdBoost / 100,
UP.MaxIterationsCountToAnalyze)) {
unsigned Boost =
- getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+ getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
- return FullUnrollTripCount;
+ return true;
+ } else {
+ InstructionCost Savings = analyzeFullUnrollCostSavings(L, SE, TTI, UP);
+ if (!(Savings.isValid() && *Savings.getValue()))
+ return false;
+    // Savings for one loop iteration are those estimated by the analysis plus
+ // the loop backedge's branch.
+ uint64_t ItSavings = *Savings.getValue() + 1;
+ // Compute estimated cost of one loop iteration in the unrolled form.
+ uint64_t ItUnrollCost = UCE.getRolledLoopSize();
+ if (ItSavings < ItUnrollCost)
+ ItUnrollCost -= ItSavings;
+ else
+ ItUnrollCost = 1;
+ uint64_t FullUnrollCost = ItUnrollCost * UP.Count + 1;
+ assert(FullUnrollCost && "loop has no cost");
+ if (FullUnrollCost < UP.Threshold)
+ return true;
}
- return std::nullopt;
+ return false;
}
static std::optional<unsigned>
@@ -873,7 +1124,7 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
if (!UP.Partial) {
LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
+ << "-unroll-allow-partial not given\n");
return 0;
}
unsigned count = UP.Count;
@@ -883,7 +1134,7 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
// Reduce unroll count to be modulo of TripCount for partial unrolling.
if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
- (LoopSize - UP.BEInsns);
+ (LoopSize - UP.BEInsns);
if (count > UP.MaxCount)
count = UP.MaxCount;
while (count != 0 && TripCount % count != 0)
@@ -980,9 +1231,7 @@ bool llvm::computeUnrollCount(
UP.Count = 0;
if (TripCount) {
UP.Count = TripCount;
- if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
- TripCount, UCE, UP)) {
- UP.Count = *UnrollFactor;
+ if (shouldFullUnroll(L, TTI, DT, SE, EphValues, UCE, UP)) {
UseUpperBound = false;
return ExplicitUnroll;
}
@@ -1003,9 +1252,7 @@ bool llvm::computeUnrollCount(
if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
MaxTripCount <= UP.MaxUpperBound) {
UP.Count = MaxTripCount;
- if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
- MaxTripCount, UCE, UP)) {
- UP.Count = *UnrollFactor;
+ if (shouldFullUnroll(L, TTI, DT, SE, EphValues, UCE, UP)) {
UseUpperBound = true;
return ExplicitUnroll;
}
@@ -1533,7 +1780,7 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
if (!Changed)
return PreservedAnalyses::all();
- // The parent must not be damaged by unrolling!
+ // The parent must not be damaged by unrolling!
#ifndef NDEBUG
if (ParentL)
ParentL->verifyLoop();
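A rough worked example of the fallback check added to shouldFullUnroll above (the numbers are illustrative assumptions, not values taken from the patch): suppose UCE.getRolledLoopSize() is 24, analyzeFullUnrollCostSavings estimates savings of 9, UP.Count is 8, and UP.Threshold is 150. Then

  ItSavings      = 9 + 1      = 10   // estimated savings plus the backedge branch
  ItUnrollCost   = 24 - 10    = 14   // per-iteration cost after deducting savings
  FullUnrollCost = 14 * 8 + 1 = 113  // FullUnrollCost = ItUnrollCost * UP.Count + 1

Since 113 < 150, the loop is fully unrolled, whereas the earlier plain size check (UCE.getUnrolledLoopSize(UP), roughly 24 per iteration times 8 iterations) would have stayed above the threshold, so the savings estimate is what makes full unrolling profitable in this hypothetical case.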
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
deleted file mode 100644
index 97de4cbf0936c6..00000000000000
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
+++ /dev/null
@@ -1,233 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; REQUIRES: asserts
-; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug < %s 2>&1 | FileCheck %s
-
-; For @dependent_sub_fullunroll, the threshold bonus should apply
-; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
-
-; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
-; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
-
-; For @dont_unroll_illegal_convergent_op, the threshold bonus should apply even if there is no unrolling
-; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
-
-; Check that the outer loop of a double-nested loop where the inner loop's trip
-; count depends exclusively on constants and the outer IV is fully unrolled
-; thanks to receiving a threshold bonus in AMDGPU's TTI.
-
-define void @dependent_sub_fullunroll(ptr noundef %mem) {
-; CHECK-LABEL: @dependent_sub_fullunroll(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
-; CHECK: outer.header:
-; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
-; CHECK: inner.header_latch_exiting:
-; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
-; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
-; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
-; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
-; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8
-; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: outer.latch_exiting:
-; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
-; CHECK: inner.header_latch_exiting.1:
-; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
-; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
-; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
-; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
-; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
-; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
-; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], 8
-; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1:%.*]], !llvm.loop [[LOOP0]]
-; CHECK: outer.latch_exiting.1:
-; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_2:%.*]]
-; CHECK: inner.header_latch_exiting.2:
-; CHECK-NEXT: [[INNER_IV_2:%.*]] = phi i32 [ 2, [[OUTER_LATCH_EXITING_1]] ], [ [[INNER_IV_NEXT_2:%.*]], [[INNER_HEADER_LATCH_EXITING_2]] ]
-; CHECK-NEXT: [[INNER_IV_NEXT_2]] = add nuw nsw i32 [[INNER_IV_2]], 1
-; CHECK-NEXT: [[INNER_IV_EXT_2:%.*]] = zext nneg i32 [[INNER_IV_2]] to i64
-; CHECK-NEXT: [[IDX_2:%.*]] = add nuw nsw i64 32, [[INNER_IV_EXT_2]]
-; CHECK-NEXT: [[ADDR_2:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_2]]
-; CHECK-NEXT: store i32 0, ptr [[ADDR_2]], align 4
-; CHECK-NEXT: [[INNER_COND_2:%.*]] = icmp ult i32 [[INNER_IV_NEXT_2]], 8
-; CHECK-NEXT: br i1 [[INNER_COND_2]], label [[INNER_HEADER_LATCH_EXITING_2]], label [[OUTER_LATCH_EXITING_2:%.*]], !llvm.loop [[LOOP0]]
-; CHECK: outer.latch_exiting.2:
-; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_3:%.*]]
-; CHECK: inner.header_latch_exiting.3:
-; CHECK-NEXT: [[INNER_IV_3:%.*]] = phi i32 [ 3, [[OUTER_LATCH_EXITING_2]] ], [ [[INNER_IV_NEXT_3:%.*]], [[INNER_HEADER_LATCH_EXITING_3]] ]
-; CHECK-NEXT: [[INNER_IV_NEXT_3]] = add nuw nsw i32 [[INNER_IV_3]], 1
-; CHECK-NEXT: [[INNER_IV_EXT_3:%.*]] = zext nneg i32 [[INNER_IV_3]] to i64
-; CHECK-NEXT: [[IDX_3:%.*]] = add nuw nsw i64 48, [[INNER_IV_EXT_3]]
-; CHECK-NEXT: [[ADDR_3:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_3]]
-; CHECK-NEXT: store i32 0, ptr [[ADDR_3]], align 4
-; CHECK-NEXT: [[INNER_COND_3:%.*]] = icmp ult i32 [[INNER_IV_NEXT_3]], 8
-; CHECK-NEXT: br i1 [[INNER_COND_3]], label [[INNER_HEADER_LATCH_EXITING_3]], label [[OUTER_LATCH_EXITING_3:%.*]], !llvm.loop [[LOOP0]]
-; CHECK: outer.latch_exiting.3:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %outer.header
-
-outer.header: ; preds = %entry, %outer.latch_exiting
- %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
- br label %inner.header_latch_exiting
-
-inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
- %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
- %inner.iv_next = add nuw nsw i32 %inner.iv, 1
- %outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
- %inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
- %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
- store i32 0, ptr %addr
- %inner.cond = icmp ult i32 %inner.iv_next, 8
- br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
-
-outer.latch_exiting: ; preds = %inner.header_latch_exiting
- %outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 4
- br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
-end: ; preds = %outer.latch_exiting
- ret void
-}
-
-; Check that the outer loop of the same loop nest as dependent_sub_fullunroll
-; is not fully unrolled when the inner loop's final IV value depends on a
-; function argument instead of a combination of the outer IV and constants.
-
-define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
-; CHECK-LABEL: @dependent_sub_no_fullunroll(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
-; CHECK: outer.header:
-; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[OUTER_LATCH_EXITING_1:%.*]] ]
-; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
-; CHECK: inner.header_latch_exiting:
-; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
-; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
-; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
-; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
-; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
-; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
-; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
-; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
-; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0]]
-; CHECK: outer.latch_exiting:
-; CHECK-NEXT: [[OUTER_IV_NEXT:%.*]] = add nuw nsw i32 [[OUTER_IV]], 1
-; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
-; CHECK: inner.header_latch_exiting.1:
-; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ [[OUTER_IV_NEXT]], [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
-; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
-; CHECK-NEXT: [[OUTER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV_NEXT]] to i64
-; CHECK-NEXT: [[IDX_PART_1:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT_1]], 16
-; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
-; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 [[IDX_PART_1]], [[INNER_IV_EXT_1]]
-; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
-; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
-; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], [[INNER_UB]]
-; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1]], !llvm.loop [[LOOP0]]
-; CHECK: outer.latch_exiting.1:
-; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 2
-; CHECK-NEXT: [[OUTER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 4
-; CHECK-NEXT: br i1 [[OUTER_COND_1]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]]
-; CHECK: end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %outer.header
-
-outer.header: ; preds = %entry, %outer.latch_exiting
- %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
- br label %inner.header_latch_exiting
-
-inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
- %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
- %inner.iv_next = add nuw nsw i32 %inner.iv, 1
- %outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
- %inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
- %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
- store i32 0, ptr %addr
- %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
- br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
-
-outer.latch_exiting: ; preds = %inner.header_latch_exiting
- %outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 4
- br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
-end: ; preds = %outer.latch_exiting
- ret void
-}
-
-; Make sure that the threshold bonus does not override a correctness check and
-; unrolling when a convergent operation that is illegal to unroll is present.
-; The loop nest is the same as before except for the fact that the outer
-; loop's upper bound is now 11 (instead of 4) and there is an uncontrolled
-; convergent call in the outer loop's header. Were the call non-convergent,
-; the outer loop would be partially unrolled by a factor of 2, with a breakout
-; of 1.
-
-declare void @convergent_operation() convergent
-
-define void @dont_unroll_illegal_convergent_op(ptr noundef %mem) {
-; CHECK-LABEL: @dont_unroll_illegal_convergent_op(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
-; CHECK: outer.header:
-; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ]
-; CHECK-NEXT: call void @convergent_operation()
-; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
-; CHECK: inner.header_latch_exiting:
-; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
-; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
-; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
-; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
-; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
-; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
-; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
-; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8
-; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING]], !llvm.loop [[LOOP0]]
-; CHECK: outer.latch_exiting:
-; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1
-; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 11
-; CHECK-NEXT: br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]]
-; CHECK: end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %outer.header
-
-outer.header: ; preds = %entry, %outer.latch_exiting
- %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
- call void @convergent_operation()
- br label %inner.header_latch_exiting
-
-inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
- %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
- %inner.iv_next = add nuw nsw i32 %inner.iv, 1
- %outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
- %inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
- %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
- store i32 0, ptr %addr
- %inner.cond = icmp ult i32 %inner.iv_next, 8
- br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
-
-outer.latch_exiting: ; preds = %inner.header_latch_exiting
- %outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 11
- br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
-end: ; preds = %outer.latch_exiting
- ret void
-}
-
-!1 = !{!1, !2}
-!2 = !{!"amdgpu.loop.unroll.threshold", i32 100}
diff --git a/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll b/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll
index 556a4032b58e4e..8f4f71abf37a93 100644
--- a/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll
+++ b/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll
@@ -22,55 +22,73 @@ define i32 @foo(ptr %a) {
; ANALYZE-FULL: for.body:
; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE:%.*]], label [[FOR_NEXT:%.*]]
; ANALYZE-FULL: do_store:
-; ANALYZE-FULL-NEXT: store i32 0, ptr [[A:%.*]], align 4
+; ANALYZE-FULL-NEXT: [[DATA:%.*]] = load i32, ptr [[A:%.*]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL:%.*]] = mul i32 [[DATA]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL]], ptr [[A]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT]]
; ANALYZE-FULL: for.next:
; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_1:%.*]], label [[FOR_NEXT_1:%.*]]
; ANALYZE-FULL: do_store.1:
; ANALYZE-FULL-NEXT: [[GEP_1:%.*]] = getelementptr i32, ptr [[A]], i32 1
-; ANALYZE-FULL-NEXT: store i32 1, ptr [[GEP_1]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_1:%.*]] = load i32, ptr [[GEP_1]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL_1:%.*]] = mul i32 [[DATA_1]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_1]], ptr [[GEP_1]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_1]]
; ANALYZE-FULL: for.next.1:
; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_2:%.*]], label [[FOR_NEXT_2:%.*]]
; ANALYZE-FULL: do_store.2:
; ANALYZE-FULL-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr [[A]], i32 2
-; ANALYZE-FULL-NEXT: store i32 2, ptr [[GEP_2]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_2:%.*]] = load i32, ptr [[GEP_2]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL_2:%.*]] = mul i32 [[DATA_2]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_2]], ptr [[GEP_2]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_2]]
; ANALYZE-FULL: for.next.2:
; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_3:%.*]], label [[FOR_NEXT_3:%.*]]
; ANALYZE-FULL: do_store.3:
; ANALYZE-FULL-NEXT: [[GEP_3:%.*]] = getelementptr i32, ptr [[A]], i32 3
-; ANALYZE-FULL-NEXT: store i32 3, ptr [[GEP_3]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_3:%.*]] = load i32, ptr [[GEP_3]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL_3:%.*]] = mul i32 [[DATA_3]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_3]], ptr [[GEP_3]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_3]]
; ANALYZE-FULL: for.next.3:
; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_4:%.*]], label [[FOR_NEXT_4:%.*]]
; ANALYZE-FULL: do_store.4:
; ANALYZE-FULL-NEXT: [[GEP_4:%.*]] = getelementptr i32, ptr [[A]], i32 4
-; ANALYZE-FULL-NEXT: store i32 4, ptr [[GEP_4]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_4:%.*]] = load i32, ptr [[GEP_4]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL_4:%.*]] = mul i32 [[DATA_4]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_4]], ptr [[GEP_4]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_4]]
; ANALYZE-FULL: for.next.4:
; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_5:%.*]], label [[FOR_NEXT_5:%.*]]
; ANALYZE-FULL: do_store.5:
; ANALYZE-FULL-NEXT: [[GEP_5:%.*]] = getelementptr i32, ptr [[A]], i32 5
-; ANALYZE-FULL-NEXT: store i32 5, ptr [[GEP_5]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_5:%.*]] = load i32, ptr [[GEP_5]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL_5:%.*]] = mul i32 [[DATA_5]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_5]], ptr [[GEP_5]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_5]]
; ANALYZE-FULL: for.next.5:
; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_6:%.*]], label [[FOR_NEXT_6:%.*]]
; ANALYZE-FULL: do_store.6:
; ANALYZE-FULL-NEXT: [[GEP_6:%.*]] = getelementptr i32, ptr [[A]], i32 6
-; ANALYZE-FULL-NEXT: store i32 6, ptr [[GEP_6]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_6:%.*]] = load i32, ptr [[GEP_6]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL_6:%.*]] = mul i32 [[DATA_6]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_6]], ptr [[GEP_6]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_6]]
; ANALYZE-FULL: for.next.6:
; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_7:%.*]], label [[FOR_NEXT_7:%.*]]
; ANALYZE-FULL: do_store.7:
; ANALYZE-FULL-NEXT: [[GEP_7:%.*]] = getelementptr i32, ptr [[A]], i32 7
-; ANALYZE-FULL-NEXT: store i32 7, ptr [[GEP_7]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_7:%.*]] = load i32, ptr [[GEP_7]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL_7:%.*]] = mul i32 [[DATA_7]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_7]], ptr [[GEP_7]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_7]]
; ANALYZE-FULL: for.next.7:
; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_8:%.*]], label [[FOR_NEXT_8:%.*]]
; ANALYZE-FULL: do_store.8:
; ANALYZE-FULL-NEXT: [[GEP_8:%.*]] = getelementptr i32, ptr [[A]], i32 8
-; ANALYZE-FULL-NEXT: store i32 8, ptr [[GEP_8]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_8:%.*]] = load i32, ptr [[GEP_8]], align 4
+; ANALYZE-FULL-NEXT: [[DATA_MUL_8:%.*]] = mul i32 [[DATA_8]], 2
+; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_8]], ptr [[GEP_8]], align 4
; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_8]]
; ANALYZE-FULL: for.next.8:
; ANALYZE-FULL-NEXT: ret i32 9
@@ -87,7 +105,10 @@ define i32 @foo(ptr %a) {
; DONT-ANALYZE-FULL-NEXT: br i1 [[CMP2]], label [[DO_STORE:%.*]], label [[FOR_NEXT]]
; DONT-ANALYZE-FULL: do_store:
; DONT-ANALYZE-FULL-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[INDVAR]]
-; DONT-ANALYZE-FULL-NEXT: store i32 [[INDVAR]], ptr [[GEP]], align 4
+; DONT-ANALYZE-FULL-NEXT: [[DATA:%.*]] = load i32, ptr [[GEP]], align 4
+; DONT-ANALYZE-FULL-NEXT: [[DATA_MUL:%.*]] = mul i32 [[DATA]], 2
+; DONT-ANALYZE-FULL-NEXT: [[DATA_ADD:%.*]] = add i32 [[DATA_MUL]], 1
+; DONT-ANALYZE-FULL-NEXT: store i32 [[DATA_MUL]], ptr [[GEP]], align 4
; DONT-ANALYZE-FULL-NEXT: br label [[FOR_NEXT]]
; DONT-ANALYZE-FULL: for.next:
; DONT-ANALYZE-FULL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INDVAR_NEXT]], 9
@@ -108,7 +129,10 @@ for.body:
do_store:
%gep = getelementptr i32, ptr %a, i32 %indvar
- store i32 %indvar, ptr %gep
+ %data = load i32, ptr %gep
+ %data_mul = mul i32 %data, 2
+ %data_add = add i32 %data_mul, 1
+ store i32 %data_mul, ptr %gep
br label %for.next
for.next:
diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll
new file mode 100644
index 00000000000000..1658af6dd55b92
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll
@@ -0,0 +1,354 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=loop-unroll -unroll-threshold=25 < %s | FileCheck %s
+
+; All functions are simple variations of the same doubly nested loop with an
+; if/then/else-like CFG structure in the outer loop. The unrolling threshold is
+; set manually so that, even after unroll cost savings analysis, the estimated
+; unrolled cost of the outer loop in the baseline remains just slightly above
+; it, so only the variants with extra cost savings are fully unrolled.
+
+; Baseline. Inner loop's bounds and if/then/else's condition depend on function
+; arguments. No unrolling happens.
+
+define void @no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub, i32 noundef %ifcond) {
+; CHECK-LABEL: @no_fullunroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ]
+; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_IF:%.*]]
+; CHECK: outer.if:
+; CHECK-NEXT: [[IF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[OUTER_IV_EXT]]
+; CHECK-NEXT: [[MOD2:%.*]] = and i32 [[IFCOND:%.*]], 1
+; CHECK-NEXT: [[IF_COND:%.*]] = icmp ult i32 [[MOD2]], 0
+; CHECK-NEXT: br i1 [[IF_COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: store i32 1, ptr [[IF_ADDR]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]]
+; CHECK: if.else:
+; CHECK-NEXT: store i32 2, ptr [[IF_ADDR]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 2
+; CHECK-NEXT: br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]]
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if
+
+outer.if: ; preds = %inner.header_latch_exiting
+ %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext
+ %mod2 = and i32 %ifcond, 1
+ %if.cond = icmp ult i32 %mod2, 0
+ br i1 %if.cond, label %if.then, label %if.else
+
+if.then: ; preds = %outer.if
+ store i32 1, ptr %if.addr
+ br label %outer.latch_exiting
+
+if.else: ; preds = %outer.if
+ store i32 2, ptr %if.addr
+ br label %outer.latch_exiting
+
+outer.latch_exiting: ; preds = %if.then, %if.else
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 2
+ br i1 %outer.cond, label %outer.header, label %end
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+; Inner loop's bounds depend only on constants and the outer IV, yielding extra
+; cost savings that are enough to fully unroll the outer loop.
+
+define void @save_subloop(ptr noundef %mem, i32 noundef %ifcond) {
+; CHECK-LABEL: @save_subloop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 2
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING_1:%.*]]
+; CHECK: outer.if:
+; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: store i32 1, ptr [[MEM]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING:%.*]]
+; CHECK: if.else:
+; CHECK-NEXT: store i32 2, ptr [[MEM]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 2
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]]
+; CHECK: outer.if.1:
+; CHECK-NEXT: [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 1
+; CHECK-NEXT: br i1 false, label [[IF_THEN_1:%.*]], label [[IF_ELSE_1:%.*]]
+; CHECK: if.else.1:
+; CHECK-NEXT: store i32 2, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_2:%.*]]
+; CHECK: if.then.1:
+; CHECK-NEXT: store i32 1, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_2]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, 2
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if
+
+outer.if: ; preds = %inner.header_latch_exiting
+ %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext
+ %mod2 = and i32 %ifcond, 1
+ %if.cond = icmp ult i32 %mod2, 0
+ br i1 %if.cond, label %if.then, label %if.else
+
+if.then: ; preds = %outer.if
+ store i32 1, ptr %if.addr
+ br label %outer.latch_exiting
+
+if.else: ; preds = %outer.if
+ store i32 2, ptr %if.addr
+ br label %outer.latch_exiting
+
+outer.latch_exiting: ; preds = %if.then, %if.else
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 2
+ br i1 %outer.cond, label %outer.header, label %end
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+; The if/then/else's condition depends only on constants and the outer IV,
+; yielding extra cost savings that are enough to fully unroll the outer loop.
+
+define void @save_ifthenelse(ptr noundef %mem, i32 noundef %inner.ub) {
+; CHECK-LABEL: @save_ifthenelse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING_1:%.*]]
+; CHECK: outer.if:
+; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: store i32 1, ptr [[MEM]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING:%.*]]
+; CHECK: if.else:
+; CHECK-NEXT: store i32 2, ptr [[MEM]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], [[INNER_UB]]
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]]
+; CHECK: outer.if.1:
+; CHECK-NEXT: [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 1
+; CHECK-NEXT: br i1 false, label [[IF_THEN_1:%.*]], label [[IF_ELSE_1:%.*]]
+; CHECK: if.else.1:
+; CHECK-NEXT: store i32 2, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_2:%.*]]
+; CHECK: if.then.1:
+; CHECK-NEXT: store i32 1, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_2]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if
+
+outer.if: ; preds = %inner.header_latch_exiting
+ %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext
+ %mod2 = and i32 %outer.iv, 1
+ %if.cond = icmp ult i32 %mod2, 0
+ br i1 %if.cond, label %if.then, label %if.else
+
+if.then: ; preds = %outer.if
+ store i32 1, ptr %if.addr
+ br label %outer.latch_exiting
+
+if.else: ; preds = %outer.if
+ store i32 2, ptr %if.addr
+ br label %outer.latch_exiting
+
+outer.latch_exiting: ; preds = %if.then, %if.else
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 2
+ br i1 %outer.cond, label %outer.header, label %end
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+
+; Tests whether an if/then-like CFG structure is also recognized as a
+; cost-saving opportunity. Same doubly nested loop as before, but the if's else
+; branch is removed and two extra instructions are added to the then branch to
+; maintain the same loop size.
+
+define void @save_ifthen(ptr noundef %mem, i32 noundef %inner.ub) {
+; CHECK-LABEL: @save_ifthen(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_IF:%.*]]
+; CHECK: outer.if:
+; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[OUTER_LATCH_EXITING:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: store i32 0, ptr [[MEM]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], [[INNER_UB]]
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]]
+; CHECK: outer.if.1:
+; CHECK-NEXT: [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 1
+; CHECK-NEXT: br i1 false, label [[IF_THEN_1:%.*]], label [[OUTER_LATCH_EXITING_1:%.*]]
+; CHECK: if.then.1:
+; CHECK-NEXT: store i32 4, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_1]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if
+
+outer.if: ; preds = %inner.header_latch_exiting
+ %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext
+ %mod2 = and i32 %outer.iv, 1
+ %if.cond = icmp ult i32 %mod2, 0
+ br i1 %if.cond, label %if.then, label %outer.latch_exiting
+
+if.then: ; preds = %outer.if
+ %mod2x2 = mul i32 %mod2, 2
+ %mod2x2x2 = mul i32 %mod2x2, 2
+ store i32 %mod2x2x2, ptr %if.addr
+ br label %outer.latch_exiting
+
+outer.latch_exiting:                              ; preds = %if.then, %outer.if
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 2
+ br i1 %outer.cond, label %outer.header, label %end
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
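
For readers skimming the tests above, here is a minimal, standalone sketch (not
part of the patch) of the nest shape the analysis rewards: the inner bound is
derived only from a constant and the outer IV, so once the outer loop is fully
unrolled every inner trip count folds to a compile-time constant. The function
name and the threshold value are illustrative assumptions, not taken from the
patch.

; Illustrative invocation (threshold value assumed):
;   opt -S -passes=loop-unroll -unroll-threshold=150 sketch.ll
define void @sketch(ptr %mem) {
entry:
  br label %outer

outer:                                            ; constant trip count of 2
  %o.iv = phi i32 [ 0, %entry ], [ %o.iv.next, %outer.latch ]
  ; The inner bound depends only on a constant and the outer IV, so it becomes
  ; a constant in each copy of the body created by fully unrolling %outer.
  %inner.ub = add nuw nsw i32 %o.iv, 4
  br label %inner

inner:                                            ; runtime-dependent on %o.iv
  %i.iv = phi i32 [ 0, %outer ], [ %i.iv.next, %inner ]
  %i.iv.next = add nuw nsw i32 %i.iv, 1
  %idx = zext i32 %i.iv to i64
  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
  store i8 0, ptr %addr
  %i.cond = icmp ult i32 %i.iv.next, %inner.ub
  br i1 %i.cond, label %inner, label %outer.latch

outer.latch:
  %o.iv.next = add nuw nsw i32 %o.iv, 1
  %o.cond = icmp ult i32 %o.iv.next, 2
  br i1 %o.cond, label %outer, label %end

end:
  ret void
}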