[llvm] [AMDGPU] Add dereferenceable retAttr to a call to llvm.amdgcn.implicitarg.ptr in amdgpu-codegenprepare (PR #182206)
Yoonseo Choi via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 19 09:07:49 PST 2026
================
@@ -0,0 +1,141 @@
+; RUN: opt -S -passes='amdgpu-codegenprepare' -mtriple=amdgcn-amd-amdhsa -disable-verify < %s | FileCheck %s --check-prefix=DEREF
+; RUN: opt -S -passes='loop-mssa(licm)' -mtriple=amdgcn-amd-amdhsa -disable-verify < %s | FileCheck %s --check-prefix=WITHOUT
+; RUN: opt -S -passes='amdgpu-codegenprepare,loop-mssa(licm)' -mtriple=amdgcn-amd-amdhsa -amdgpu-codegenprepare-mark-dereferenceable-calls=false < %s \
+; RUN: | FileCheck %s --check-prefix=WITHOUT
+; RUN: opt -S -passes='amdgpu-codegenprepare,loop-mssa(licm)' -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s --check-prefix=WITH
+
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+
+define protected amdgpu_kernel void @foo(ptr addrspace(1) noundef readonly captures(none) %d_a.coerce, ptr addrspace(1) noundef readonly captures(none) %d_b.coerce, ptr addrspace(1) noundef writeonly captures(none) %d_c.coerce, i32 noundef %count) local_unnamed_addr #0 {
+; DEREF-LABEL: define protected amdgpu_kernel void @foo(
+; DEREF: tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+;
+; WITHOUT-LABEL: define protected amdgpu_kernel void @foo(
+; WITHOUT-SAME: ptr addrspace(1) noundef readonly captures(none) [[D_A_COERCE:%.*]], ptr addrspace(1) noundef readonly captures(none) [[D_B_COERCE:%.*]], ptr addrspace(1) noundef writeonly captures(none) [[D_C_COERCE:%.*]], i32 noundef [[COUNT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; WITHOUT-NEXT: [[ENTRY:.*:]]
+; WITHOUT-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; WITHOUT-NEXT: [[CMP11:%.*]] = icmp samesign ult i32 [[TMP0]], 4
+; WITHOUT-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; WITHOUT: [[FOR_BODY_LR_PH]]:
+; WITHOUT-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x()
+; WITHOUT-NEXT: [[TMP2:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; WITHOUT-NEXT: [[CMP79:%.*]] = icmp samesign ult i32 [[TMP2]], 4
+; WITHOUT-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+; WITHOUT-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], 192
+; WITHOUT-NEXT: [[TMP3:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; WITHOUT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP3]], i64 12
+; WITHOUT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP3]], i64 14
+; WITHOUT-NEXT: [[DOTIN_I_I_I:%.*]] = load i16, ptr addrspace(4) [[TMP5]], align 2, !tbaa [[SHORT_TBAA6:![0-9]+]]
+; WITHOUT-NEXT: [[CONV_I_I:%.*]] = zext i16 [[DOTIN_I_I_I]] to i32
+; WITHOUT-NEXT: br label %[[FOR_BODY:.*]]
+;
+; WITH-LABEL: define protected amdgpu_kernel void @foo(
+; WITH-SAME: ptr addrspace(1) noundef readonly captures(none) [[D_A_COERCE:%.*]], ptr addrspace(1) noundef readonly captures(none) [[D_B_COERCE:%.*]], ptr addrspace(1) noundef writeonly captures(none) [[D_C_COERCE:%.*]], i32 noundef [[COUNT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; WITH-NEXT: [[ENTRY:.*:]]
+; WITH-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; WITH-NEXT: [[CMP11:%.*]] = icmp samesign ult i32 [[TMP0]], 4
+; WITH-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; WITH: [[FOR_BODY_LR_PH]]:
+; WITH-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x()
+; WITH-NEXT: [[TMP2:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; WITH-NEXT: [[CMP79:%.*]] = icmp samesign ult i32 [[TMP2]], 4
+; WITH-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+; WITH-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], 192
+; WITH-NEXT: [[TMP3:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; WITH-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP3]], i64 12
+; WITH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP3]], i64 14
+; WITH-NEXT: [[DOTIN_I_I_I:%.*]] = load i16, ptr addrspace(4) [[TMP5]], align 2, !tbaa [[SHORT_TBAA6:![0-9]+]]
+; WITH-NEXT: [[CONV_I_I:%.*]] = zext i16 [[DOTIN_I_I_I]] to i32
+; WITH-NEXT: [[DOTIN_I_I_I7:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 4
----------------
yoonseoch wrote:
This load and the following zext are hoisted into the outer loop's preheader.
https://github.com/llvm/llvm-project/pull/182206
More information about the llvm-commits
mailing list