[llvm] [AMDGPU][TTI] Threshold bonus to loops whose unrolling makes nested loops unrollable (PR #114579)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 4 02:54:27 PST 2024
https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/114579
From f56c4321413f3fd567b4044f1cc9521da65e6797 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <Lucas.Ramirez at amd.com>
Date: Fri, 1 Nov 2024 18:03:14 +0100
Subject: [PATCH 1/2] Give cond. loop threshold bonus to outer loop in loop
nests
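
The bonus targets loop nests in which the inner loop's trip count depends only
on constants and the outer loop's induction variable, so that fully unrolling
the outer loop leaves every copy of the inner loop with a compile-time-constant
trip count that can in turn be fully unrolled. As a rough C++ sketch of the
shape involved (the bounds are illustrative and mirror the IR test added
below):

  // Inner trip count (8 - i) depends on the outer IV i, so it is not a
  // compile-time constant while the outer loop is still rolled.
  for (int i = 0; i < 8; ++i)
    for (int j = i; j < 8; ++j)
      mem[i * 16 + j] = 0;
  // Once the outer loop is fully unrolled, each inner loop copy runs a
  // constant number of times (8, 7, ..., 1) and becomes a full-unroll
  // candidate itself.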
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 68 ++++++++++++++-
.../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 84 +++++++++++++++++++
2 files changed, 151 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5160851f8c4424..79250ad1f83064 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -47,6 +47,13 @@ static cl::opt<unsigned> UnrollThresholdIf(
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(200), cl::Hidden);
+static cl::opt<unsigned> UnrollThresholdNestedStatic(
+ "amdgpu-unroll-threshold-nested-static",
+ cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose "
+ "trip count will be made runtime-independent when fully-unrolling "
+ "the outer loop"),
+ cl::init(200), cl::Hidden);
+
static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
@@ -148,8 +155,67 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
}
-
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+
+ if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
+ // Look for subloops whose trip count would go from runtime-dependent to
+ // runtime-independent if we were to unroll the loop. Give a bonus to the
+ // current loop's unrolling threshold for each of these, as fully unrolling
+ // it would likely expose additional optimization opportunities.
+ for (const Loop *SubLoop : L->getSubLoops()) {
+ std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+ if (!Bounds)
+ continue;
+ Value *InitIV = &Bounds->getInitialIVValue();
+ Value *FinalIV = &Bounds->getFinalIVValue();
+ Value *StepVal = Bounds->getStepValue();
+ if (!StepVal)
+ continue;
+
+ // Determines whether SubIV's derivation depends exclusively on constants
+ // and/or IV; if it does, SubIVDependsOnIV is set to true if IV is
+ // involved in the derivation.
+ bool SubIVDependsOnIV = false;
+ std::function<bool(const Value *, unsigned)> FromConstsOrLoopIV =
+ [&](const Value *SubIV, unsigned Depth) -> bool {
+ if (SubIV == IV) {
+ SubIVDependsOnIV = true;
+ return true;
+ }
+ if (isa<Constant>(SubIV))
+ return true;
+ if (Depth >= 10)
+ return false;
+
+ const Instruction *I = dyn_cast<Instruction>(SubIV);
+ // No point in checking outside the loop since IV is necessarily inside
+ // it; also stop searching when encountering an instruction that will
+ // likely not allow SubIV's value to be statically computed.
+ if (!I || !L->contains(I) || !isa<BinaryOperator, CastInst, PHINode>(I))
+ return false;
+
+ // SubIV depends on constants or IV if all of the instruction's
+ // operands involved in its derivation also depend on constants or IV.
+ return llvm::all_of(I->operand_values(), [&](const Value *V) {
+ return FromConstsOrLoopIV(V, Depth + 1);
+ });
+ };
+
+ if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) &&
+ FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) {
+ UP.Threshold += UnrollThresholdNestedStatic;
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n"
+ << *L
+ << " due to subloop's trip count becoming "
+ "runtime-independent after unrolling:\n "
+ << *SubLoop);
+ if (UP.Threshold >= MaxBoost)
+ return;
+ }
+ }
+ }
+
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getDataLayout();
unsigned LocalGEPsSeen = 0;
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
new file mode 100644
index 00000000000000..36101c50db98ac
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
@@ -0,0 +1,84 @@
+; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s
+
+; For @dependent_sub_fullunroll, the threshold bonus should apply
+; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
+
+; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
+; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
+
+; Check that the outer loop of a doubly nested loop, where the inner loop's
+; trip count depends exclusively on constants and the outer IV, is fully
+; unrolled thanks to receiving a threshold bonus in AMDGPU's TTI.
+
+; CHECK-LABEL: @dependent_sub_fullunroll
+; CHECK: inner.header_latch_exiting.7
+; CHECK: outer.latch_exiting.7
+
+define void @dependent_sub_fullunroll(ptr noundef %mem) {
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, 8
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 8
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+; Check that the outer loop of the same loop nest as dependent_sub_fullunroll
+; is not fully unrolled when the inner loop's final IV value depends on a
+; function argument instead of a combination of the outer IV and constants.
+
+; CHECK-LABEL: @dependent_sub_no_fullunroll
+; CHECK-NOT: inner.header_latch_exiting.7
+; CHECK-NOT: outer.latch_exiting.7
+
+define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 8
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"amdgpu.loop.unroll.threshold", i32 100}
From 0a72dca4142ab896b39bd89f3c5fdab4e1ed6bd8 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <Lucas.Ramirez at amd.com>
Date: Mon, 4 Nov 2024 11:53:43 +0100
Subject: [PATCH 2/2] Address reviewers' comments
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +-
.../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 183 ++++++++++++++++--
2 files changed, 167 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 79250ad1f83064..8d6eb94af4a108 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -157,7 +157,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
- if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
+ if (PHINode *IV = L->getInductionVariable(SE)) {
// Look for subloops whose trip count would go from runtime-dependent to
// runtime-independent if we were to unroll the loop. Give a bonus to the
// current loop's unrolling threshold for each of these, as fully unrolling
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
index 36101c50db98ac..97de4cbf0936c6 100644
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug < %s 2>&1 | FileCheck %s
; For @dependent_sub_fullunroll, the threshold bonus should apply
; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
@@ -6,15 +8,63 @@
; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
+; For @dont_unroll_illegal_convergent_op, the threshold bonus should apply even if there is no unrolling
+; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
+
; Check that the outer loop of a doubly nested loop, where the inner loop's
; trip count depends exclusively on constants and the outer IV, is fully
; unrolled thanks to receiving a threshold bonus in AMDGPU's TTI.
-; CHECK-LABEL: @dependent_sub_fullunroll
-; CHECK: inner.header_latch_exiting.7
-; CHECK: outer.latch_exiting.7
-
define void @dependent_sub_fullunroll(ptr noundef %mem) {
+; CHECK-LABEL: @dependent_sub_fullunroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_2:%.*]]
+; CHECK: inner.header_latch_exiting.2:
+; CHECK-NEXT: [[INNER_IV_2:%.*]] = phi i32 [ 2, [[OUTER_LATCH_EXITING_1]] ], [ [[INNER_IV_NEXT_2:%.*]], [[INNER_HEADER_LATCH_EXITING_2]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_2]] = add nuw nsw i32 [[INNER_IV_2]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_2:%.*]] = zext nneg i32 [[INNER_IV_2]] to i64
+; CHECK-NEXT: [[IDX_2:%.*]] = add nuw nsw i64 32, [[INNER_IV_EXT_2]]
+; CHECK-NEXT: [[ADDR_2:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_2]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_2]], align 4
+; CHECK-NEXT: [[INNER_COND_2:%.*]] = icmp ult i32 [[INNER_IV_NEXT_2]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_2]], label [[INNER_HEADER_LATCH_EXITING_2]], label [[OUTER_LATCH_EXITING_2:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.2:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_3:%.*]]
+; CHECK: inner.header_latch_exiting.3:
+; CHECK-NEXT: [[INNER_IV_3:%.*]] = phi i32 [ 3, [[OUTER_LATCH_EXITING_2]] ], [ [[INNER_IV_NEXT_3:%.*]], [[INNER_HEADER_LATCH_EXITING_3]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_3]] = add nuw nsw i32 [[INNER_IV_3]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_3:%.*]] = zext nneg i32 [[INNER_IV_3]] to i64
+; CHECK-NEXT: [[IDX_3:%.*]] = add nuw nsw i64 48, [[INNER_IV_EXT_3]]
+; CHECK-NEXT: [[ADDR_3:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_3]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_3]], align 4
+; CHECK-NEXT: [[INNER_COND_3:%.*]] = icmp ult i32 [[INNER_IV_NEXT_3]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_3]], label [[INNER_HEADER_LATCH_EXITING_3]], label [[OUTER_LATCH_EXITING_3:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.3:
+; CHECK-NEXT: ret void
+;
entry:
br label %outer.header
@@ -26,9 +76,9 @@ inner.header_latch_exiting: ; preds = %outer.h
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds i8, ptr %mem, i64 %idx
store i32 0, ptr %addr
%inner.cond = icmp ult i32 %inner.iv_next, 8
@@ -36,9 +86,9 @@ inner.header_latch_exiting: ; preds = %outer.h
outer.latch_exiting: ; preds = %inner.header_latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 8
+ %outer.cond = icmp ult i32 %outer.iv_next, 4
br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
+
end: ; preds = %outer.latch_exiting
ret void
}
@@ -47,11 +97,45 @@ end: ; preds = %outer.l
; is not fully unrolled when the inner loop's final IV value depends on a
; function argument instead of a combination of the outer IV and constants.
-; CHECK-LABEL: @dependent_sub_no_fullunroll
-; CHECK-NOT: inner.header_latch_exiting.7
-; CHECK-NOT: outer.latch_exiting.7
-
define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
+; CHECK-LABEL: @dependent_sub_no_fullunroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[OUTER_LATCH_EXITING_1:%.*]] ]
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: [[OUTER_IV_NEXT:%.*]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ [[OUTER_IV_NEXT]], [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV_NEXT]] to i64
+; CHECK-NEXT: [[IDX_PART_1:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT_1]], 16
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 [[IDX_PART_1]], [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], [[INNER_UB]]
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 2
+; CHECK-NEXT: [[OUTER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 4
+; CHECK-NEXT: br i1 [[OUTER_COND_1]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
entry:
br label %outer.header
@@ -63,9 +147,9 @@ inner.header_latch_exiting: ; preds = %outer.h
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds i8, ptr %mem, i64 %idx
store i32 0, ptr %addr
%inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
@@ -73,9 +157,74 @@ inner.header_latch_exiting: ; preds = %outer.h
outer.latch_exiting: ; preds = %inner.header_latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 8
+ %outer.cond = icmp ult i32 %outer.iv_next, 4
br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+; Make sure that the threshold bonus does not override the correctness check
+; and cause unrolling when a convergent operation that is illegal to unroll
+; is present. The loop nest is the same as before except that the outer
+; loop's upper bound is now 11 (instead of 4) and there is an uncontrolled
+; convergent call in the outer loop's header. Were the call non-convergent,
+; the outer loop would be partially unrolled by a factor of 2, with a
+; breakout of 1.
+
+declare void @convergent_operation() convergent
+
+define void @dont_unroll_illegal_convergent_op(ptr noundef %mem) {
+; CHECK-LABEL: @dont_unroll_illegal_convergent_op(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ]
+; CHECK-NEXT: call void @convergent_operation()
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 11
+; CHECK-NEXT: br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ call void @convergent_operation()
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, 8
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 11
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
end: ; preds = %outer.latch_exiting
ret void
}
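
For reference, with an asserts-enabled build (the test carries REQUIRES:
asserts) the new debug output can be exercised roughly as the RUN line does,
e.g.:

  opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug \
    llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll 2>&1 \
    | FileCheck llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll

The size of the per-subloop bonus can be tuned with the new
-amdgpu-unroll-threshold-nested-static option (default 200).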