[llvm] [AMDGPU] Add flag to enable expensive trip counts for runtime unroll. (PR #171735)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 07:55:30 PST 2025
https://github.com/carlobertolli updated https://github.com/llvm/llvm-project/pull/171735
>From b49ad87bdf12266c40398c292cdf221b5db48081 Mon Sep 17 00:00:00 2001
From: Carlo Bertolli <carlo.bertolli at amd.com>
Date: Wed, 10 Dec 2025 16:23:18 -0600
Subject: [PATCH] [AMDGPU] Add flag to enable expensive trip counts for runtime
unroll.
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 8 ++
.../LoopUnroll/AMDGPU/expensive-tripcount.ll | 77 +++++++++++++++++++
2 files changed, 85 insertions(+)
create mode 100644 llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 35406a387cf0f..e5f6946942063 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -83,6 +83,12 @@ static cl::opt<unsigned> MemcpyLoopUnroll(
"operations when lowering memcpy as a loop"),
cl::init(16), cl::Hidden);
+static cl::opt<bool> UnrollRuntimeExpensiveTripCount(
+ "amdgpu-unroll-runtime-expensive-trip-count",
+ cl::desc("Allow emitting expensive instructions (such as divisions) when "
+ "computing the trip count of a loop for runtime unrolling"),
+ cl::init(false), cl::Hidden);
+
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
unsigned Depth = 0) {
const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -124,6 +130,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(
// We want to run unroll even for the loops which have been vectorized.
UP.UnrollVectorizedLoop = true;
+ UP.AllowExpensiveTripCount = UnrollRuntimeExpensiveTripCount;
+
// TODO: Do we want runtime unrolling?
// Maximum alloca size than can fit registers. Reserve 16 registers.
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
new file mode 100644
index 0000000000000..a53ae467b59ec
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
@@ -0,0 +1,77 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK,CHECK-NOUNROLL %s
+; RUN: opt -amdgpu-unroll-runtime-expensive-trip-count -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK,CHECK-UNROLL %s
+
+
+; CHECK-LABEL: @_Z6kernelPilll(
+; CHECK: for.body:
+
+; CHECK-NOUNROLL: store
+; CHECK-NOUNROLL-NOT: store
+; CHECK-NOUNROLL: br
+
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: br
+
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
+define protected amdgpu_kernel void @_Z6kernelPilll(ptr addrspace(1) noundef writeonly captures(none) %a.coerce, i64 noundef %n, i64 noundef %k, i64 noundef %s) local_unnamed_addr #0 {
+entry:
+ %impl.arg.ptr= tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %bid = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %impl.arg = load i32, ptr addrspace(4) %impl.arg.ptr, align 4
+ %cmp.ult = icmp ult i32 %bid, %impl.arg
+ %sel.i = select i1 %cmp.ult, i64 12, i64 18
+ %gep = getelementptr inbounds nuw i8, ptr addrspace(4) %impl.arg.ptr, i64 %sel.i
+ %ld = load i16, ptr addrspace(4) %gep, align 2
+ %tid = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+ %mul = mul nsw i64 %k, %n
+ %mul.i.i = sub nsw i64 0, %s
+ %cmp.not.i.i = icmp sge i64 %mul, %mul.i.i
+ %cmp1.i.i = icmp slt i64 %mul, %s
+ %or.cond.i.i = and i1 %cmp.not.i.i, %cmp1.i.i
+ %cmp.i.i = icmp eq i64 %s, 0
+ %add.i = add i64 %mul, 1
+ %or.cond.i.i.i.i = icmp ult i64 %add.i, 2
+ %mul.i.i.i = lshr i64 %mul, 63
+ %spec.select.i.i.i.i = add nsw i64 %mul.i.i.i, %mul
+ %cmp2.i.i = icmp slt i64 %mul, 0
+ %add.i.i = select i1 %cmp2.i.i, i64 %s, i64 0
+ %spec.select.i.i = add nsw i64 %add.i.i, %mul
+ %conv.i.i4 = zext i16 %ld to i32
+ %and = and i1 %cmp.i.i, %or.cond.i.i.i.i
+ %spec.select = select i1 %and, i64 %spec.select.i.i.i.i, i64 -1
+ %retval.0.i.i = select i1 %or.cond.i.i, i64 %spec.select.i.i, i64 %spec.select
+ %conv5 = zext nneg i32 %tid to i64
+ %cmp6 = icmp sgt i64 %retval.0.i.i, %conv5
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %conv8 = phi i64 [ %conv, %for.body ], [ %conv5, %for.body.preheader ]
+ %i3.07 = phi i32 [ %add, %for.body ], [ %tid, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds nuw i32, ptr addrspace(1) %a.coerce, i64 %conv8
+ store i32 %i3.07, ptr addrspace(1) %arrayidx, align 4
+ %add = add nuw nsw i32 %i3.07, %conv.i.i4
+ %conv = zext nneg i32 %add to i64
+ %cmp = icmp sgt i64 %retval.0.i.i, %conv
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !152
+}
+
+!47 = !{!"llvm.loop.mustprogress"}
+!152 = distinct !{!152, !47, !153}
+!153 = !{!"llvm.loop.unroll.enable"}
More information about the llvm-commits
mailing list