[llvm] [AMDGPU] Pre-commit tests for preload kernarg move to attributor (PR #123546)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 19 20:53:58 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Austin Kerbow (kerbowa)
<details>
<summary>Changes</summary>
NFC. Since the handling will be moved to the middle end, the tests need to be reorganized. The IR tests are expanded, since end-to-end testing is no longer baked in.
---
Patch is 279.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123546.diff
4 Files Affected:
- (modified) llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll (+638-74)
- (modified) llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll (+116-477)
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll (+1319-323)
- (removed) llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll (-263)
``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
index aeb7faade47150..180f01257f1f6d 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
@@ -1,20 +1,72 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD %s
+
+define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_aggregate_arg_block_count_x(ptr addrspace(1) %out, { i32, i32 } inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
@@ -27,9 +79,34 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] {
+define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) %out, i32 inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512 inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
@@ -38,8 +115,8 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
@@ -52,31 +129,335 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5
ret void
}
-define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0
+define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg, i32) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
+; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
+; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i64, ptr addrspace(4) %imp_arg_ptr
+ store i64 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2
+; NO-PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2
+; PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i16, ptr addrspace(4) %imp_arg_ptr
+ store i16 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_Y]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+ %load = load i32, ptr addrspace(4) %gep
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABE...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/123546
More information about the llvm-commits
mailing list