[llvm] [AMDGPU] Optimize block count calculations to the new ABI (PR #174112)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 1 00:35:54 PST 2026
================
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck %s
+
+define i32 @num_blocks(i32 noundef %dim) {
+; CHECK-LABEL: define i32 @num_blocks(
+; CHECK-SAME: i32 noundef [[DIM:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: switch i32 [[DIM]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT: i32 0, label %[[DIM_X:.*]]
+; CHECK-NEXT: i32 1, label %[[DIM_Y:.*]]
+; CHECK-NEXT: i32 2, label %[[DIM_Z:.*]]
+; CHECK-NEXT: ]
+; CHECK: [[DIM_X]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[DIM_Y]]:
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i64 4
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[DIM_Z]]:
+; CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i64 8
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[DEFAULT]]:
+; CHECK-NEXT: unreachable
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RETVAL_0_I_IN:%.*]] = phi ptr addrspace(4) [ [[TMP0]], %[[DIM_X]] ], [ [[TMP2]], %[[DIM_Y]] ], [ [[TMP4]], %[[DIM_Z]] ]
+; CHECK-NEXT: [[RETVAL_0_I:%.*]] = load i32, ptr addrspace(4) [[RETVAL_0_I_IN]], align 4
+; CHECK-NEXT: ret i32 [[RETVAL_0_I]]
+;
+entry:
+ switch i32 %dim, label %default [
+ i32 0, label %dim_x
+ i32 1, label %dim_y
+ i32 2, label %dim_z
+ ]
+
+dim_x:
+ %0 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %1 = getelementptr i8, ptr addrspace(4) %0, i32 12
+ %2 = load i32, ptr addrspace(4) %1, align 4
+ %3 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %4 = getelementptr i8, ptr addrspace(4) %3, i32 12
+ %5 = load i16, ptr addrspace(4) %4, align 2
+ %conv_x = zext i16 %5 to i32
+ %count_x = udiv i32 %2, %conv_x
+ br label %exit
+
+dim_y:
+ %6 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %7 = getelementptr i8, ptr addrspace(4) %6, i32 16
+ %8 = load i32, ptr addrspace(4) %7, align 4
+ %9 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %10 = getelementptr i8, ptr addrspace(4) %9, i32 14
+ %11 = load i16, ptr addrspace(4) %10, align 2
+ %conv_y = zext i16 %11 to i32
+ %count_y = udiv i32 %8, %conv_y
+ br label %exit
+
+dim_z:
+ %12 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %13 = getelementptr i8, ptr addrspace(4) %12, i32 20
+ %14 = load i32, ptr addrspace(4) %13, align 4
+ %15 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %16 = getelementptr i8, ptr addrspace(4) %15, i32 16
+ %17 = load i16, ptr addrspace(4) %16, align 2
+ %conv_z = zext i16 %17 to i32
----------------
arsenm wrote:
Needs more negative tests for cases that don't match the pattern: wrong offsets, wrong types, wrong casts, missing casts.
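A minimal sketch of what such negative tests could look like for the X dimension (function names and the specific breakages are illustrative, not taken from the patch): the first uses a workgroup-size offset of 13 instead of 12, the second loads the workgroup size as i32 with no zext, so neither should be folded into a block-count load from the implicit arguments.

; Hypothetical negative test: workgroup-size load at the wrong implicitarg offset (13, not 12).
define i32 @num_blocks_x_wrong_offset() {
entry:
  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
  %grid.gep = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
  %grid = load i32, ptr addrspace(4) %grid.gep, align 4
  %implicit = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %wgs.gep = getelementptr i8, ptr addrspace(4) %implicit, i32 13
  %wgs = load i16, ptr addrspace(4) %wgs.gep, align 1
  %wgs.ext = zext i16 %wgs to i32
  %count = udiv i32 %grid, %wgs.ext
  ret i32 %count
}

; Hypothetical negative test: workgroup size loaded as i32 instead of i16 + zext.
define i32 @num_blocks_x_wrong_type() {
entry:
  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
  %grid.gep = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
  %grid = load i32, ptr addrspace(4) %grid.gep, align 4
  %implicit = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %wgs.gep = getelementptr i8, ptr addrspace(4) %implicit, i32 12
  %wgs = load i32, ptr addrspace(4) %wgs.gep, align 4
  %count = udiv i32 %grid, %wgs
  ret i32 %count
}

declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()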
https://github.com/llvm/llvm-project/pull/174112