[llvm] [AMDGPU] Lower `llvm.amdgcn.queue.ptr` intrinsic to using implicit kernel argument if feasible (PR #103490)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 13 17:41:52 PDT 2024
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/103490
None
>From be06d03e467e739dacac042f290c7b9be2e01d8c Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 13 Aug 2024 20:33:19 -0400
Subject: [PATCH] [AMDGPU] Lower `llvm.amdgcn.queue.ptr` intrinsic to using
implicit kernel argument if feasible
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +-
.../abi-attribute-hints-undefined-behavior.ll | 104 ++++++++++++------
.../callee-special-input-sgprs-fixed-abi.ll | 3 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 22 ++--
4 files changed, 94 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 86fc100f1c2da0..52f93472eac206 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8414,8 +8414,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
- case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
+ if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5)
+ return loadImplicitKernelArgument(DAG, MVT::i64, DL, Align(8), QUEUE_PTR);
+ [[fallthrough]];
+ }
+ case Intrinsic::amdgcn_dispatch_ptr: {
if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
DiagnosticInfoUnsupported BadIntrin(
MF.getFunction(), "unsupported hsa intrinsic without hsa target",
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e53653408feb40..dcbe3363f5874a 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -204,26 +204,50 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
}
define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
-; FIXEDABI-LABEL: marked_func_use_other_sgpr:
-; FIXEDABI: ; %bb.0:
-; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6
-; FIXEDABI-NEXT: v_mov_b32_e32 v3, s7
-; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
-; FIXEDABI-NEXT: v_mov_b32_e32 v3, s9
-; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s4
-; FIXEDABI-NEXT: v_mov_b32_e32 v3, s5
-; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10
-; FIXEDABI-NEXT: v_mov_b32_e32 v3, s11
-; FIXEDABI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: s_setpc_b64 s[30:31]
+; FIXEDABI-SDAG-LABEL: marked_func_use_other_sgpr:
+; FIXEDABI-SDAG: ; %bb.0:
+; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
+; FIXEDABI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; FIXEDABI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s7
+; FIXEDABI-SDAG-NEXT: flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s8
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; FIXEDABI-SDAG-NEXT: flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s5
+; FIXEDABI-SDAG-NEXT: flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; FIXEDABI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; FIXEDABI-GISEL-LABEL: marked_func_use_other_sgpr:
+; FIXEDABI-GISEL: ; %bb.0:
+; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s7
+; FIXEDABI-GISEL-NEXT: flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s9
+; FIXEDABI-GISEL-NEXT: flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; FIXEDABI-GISEL-NEXT: flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v2, s10
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s11
+; FIXEDABI-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT: s_setpc_b64 s[30:31]
%queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -236,18 +260,34 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
}
define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
-; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
-; FIXEDABI: ; %bb.0:
-; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
-; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
-; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
-; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
-; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT: s_endpgm
+; FIXEDABI-SDAG-LABEL: marked_kernel_use_other_sgpr:
+; FIXEDABI-SDAG: ; %bb.0:
+; FIXEDABI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd0
+; FIXEDABI-SDAG-NEXT: s_add_u32 s2, s4, 8
+; FIXEDABI-SDAG-NEXT: s_addc_u32 s3, s5, 0
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; FIXEDABI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; FIXEDABI-SDAG-NEXT: flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-SDAG-NEXT: s_endpgm
+;
+; FIXEDABI-GISEL-LABEL: marked_kernel_use_other_sgpr:
+; FIXEDABI-GISEL: ; %bb.0:
+; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s4, 8
+; FIXEDABI-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s5, 0
+; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; FIXEDABI-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-GISEL-NEXT: s_endpgm
%queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..c2c6670c3ecacd 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -22,7 +22,8 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
}
; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
+; GCN: s_mov_b64 s[4:5], 0xc8
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
define hidden void @use_queue_ptr() #1 {
%queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
%value = load volatile i32, ptr addrspace(4) %queue_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index b89dbd42e0466f..8999b8f0d07678 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -287,20 +287,24 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_add_u32 s0, s6, 8
-; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT: s_addc_u32 s1, s7, 0
-; GFX8V5-NEXT: s_waitcnt vmcnt(0)
+; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
+; GFX8V5-NEXT: s_add_u32 s2, s6, 8
+; GFX8V5-NEXT: s_addc_u32 s3, s7, 0
+; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
+; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
+; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
+; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
+; GFX8V5-NEXT: v_mov_b32_e32 v1, s3
+; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
-; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
@@ -327,16 +331,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V5: ; %bb.0:
+; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
+; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
-; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
; GFX9V5-NEXT: v_mov_b32_e32 v1, s9
-; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-commits
mailing list