[llvm] [AMDGPU] Add flag to enable expensive trip counts for runtime unroll. (PR #171735)

Fri Dec 12 14:58:04 PST 2025

================
@@ -0,0 +1,77 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-NOUNROLL %s
+; RUN: opt -amdgpu-unroll-runtime-expensive-trip-count -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-UNROLL %s
+
+
+; CHECK-LABEL: @_Z6kernelPilll(
+; CHECK: for.body:
+
+; CHECK-NOUNROLL: store
+; CHECK-NOUNROLL-NOT: store
+; CHECK-NOUNROLL: br
+
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: br
+
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
+define protected amdgpu_kernel void @_Z6kernelPilll(ptr addrspace(1) noundef writeonly captures(none) %a.coerce, i64 noundef %n, i64 noundef %k, i64 noundef %s) local_unnamed_addr #0 {
+entry:
+  %0 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %1 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %2 = load i32, ptr addrspace(4) %0, align 4
+  %3 = icmp ult i32 %1, %2
+  %4 = select i1 %3, i64 12, i64 18
+  %5 = getelementptr inbounds nuw i8, ptr addrspace(4) %0, i64 %4
+  %6 = load i16, ptr addrspace(4) %5, align 2
+  %7 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+  %mul = mul nsw i64 %k, %n
+  %mul.i.i = sub nsw i64 0, %s
+  %cmp.not.i.i = icmp sge i64 %mul, %mul.i.i
+  %cmp1.i.i = icmp slt i64 %mul, %s
----------------
shiltian wrote:

I don't think the test has to be exactly the same as the code you originally took it from. A small reproducer is enough, as long as it reflects the intent of the change. There are also a lot of unnecessary attributes here.

https://github.com/llvm/llvm-project/pull/171735