[llvm] [AMDGPU] Lower `llvm.amdgcn.queue.ptr` intrinsic to using implicit kernel argument if feasible (PR #103490)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 13 17:41:52 PDT 2024


https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/103490

None

>From be06d03e467e739dacac042f290c7b9be2e01d8c Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 13 Aug 2024 20:33:19 -0400
Subject: [PATCH] [AMDGPU] Lower `llvm.amdgcn.queue.ptr` intrinsic to using
 implicit kernel argument if feasible

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   7 +-
 .../abi-attribute-hints-undefined-behavior.ll | 104 ++++++++++++------
 .../callee-special-input-sgprs-fixed-abi.ll   |   3 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |  22 ++--
 4 files changed, 94 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 86fc100f1c2da0..52f93472eac206 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8414,8 +8414,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
   }
-  case Intrinsic::amdgcn_dispatch_ptr:
   case Intrinsic::amdgcn_queue_ptr: {
+    const Module *M = DAG.getMachineFunction().getFunction().getParent();
+    if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5)
+      return loadImplicitKernelArgument(DAG, MVT::i64, DL, Align(8), QUEUE_PTR);
+    [[fallthrough]];
+  }
+  case Intrinsic::amdgcn_dispatch_ptr: {
     if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
       DiagnosticInfoUnsupported BadIntrin(
           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e53653408feb40..dcbe3363f5874a 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -204,26 +204,50 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
 }
 
 define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
-; FIXEDABI-LABEL: marked_func_use_other_sgpr:
-; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s6
-; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s7
-; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
-; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s9
-; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s4
-; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s5
-; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s10
-; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s11
-; FIXEDABI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
+; FIXEDABI-SDAG-LABEL: marked_func_use_other_sgpr:
+; FIXEDABI-SDAG:       ; %bb.0:
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[6:7], 0xc8
+; FIXEDABI-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; FIXEDABI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v3, s7
+; FIXEDABI-SDAG-NEXT:    flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v3, s9
+; FIXEDABI-SDAG-NEXT:    flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
+; FIXEDABI-SDAG-NEXT:    flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; FIXEDABI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; FIXEDABI-GISEL-LABEL: marked_func_use_other_sgpr:
+; FIXEDABI-GISEL:       ; %bb.0:
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; FIXEDABI-GISEL-NEXT:    flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v2, s8
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v3, s9
+; FIXEDABI-GISEL-NEXT:    flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; FIXEDABI-GISEL-NEXT:    flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v2, s10
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v3, s11
+; FIXEDABI-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
   %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -236,18 +260,34 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 }
 
 define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
-; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
-; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_add_u32 s0, s4, 8
-; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT:    s_addc_u32 s1, s5, 0
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v0, s0
-; FIXEDABI-NEXT:    v_mov_b32_e32 v1, s1
-; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT:    s_endpgm
+; FIXEDABI-SDAG-LABEL: marked_kernel_use_other_sgpr:
+; FIXEDABI-SDAG:       ; %bb.0:
+; FIXEDABI-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd0
+; FIXEDABI-SDAG-NEXT:    s_add_u32 s2, s4, 8
+; FIXEDABI-SDAG-NEXT:    s_addc_u32 s3, s5, 0
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; FIXEDABI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; FIXEDABI-SDAG-NEXT:    flat_load_ubyte v2, v[2:3] glc
+; FIXEDABI-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-SDAG-NEXT:    s_endpgm
+;
+; FIXEDABI-GISEL-LABEL: marked_kernel_use_other_sgpr:
+; FIXEDABI-GISEL:       ; %bb.0:
+; FIXEDABI-GISEL-NEXT:    s_add_u32 s0, s4, 8
+; FIXEDABI-GISEL-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-GISEL-NEXT:    s_addc_u32 s1, s5, 0
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; FIXEDABI-GISEL-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-GISEL-NEXT:    s_endpgm
   %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
   %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..c2c6670c3ecacd 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -22,7 +22,8 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
 }
 
 ; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
+; GCN: s_mov_b64 s[4:5], 0xc8
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 define hidden void @use_queue_ptr() #1 {
   %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
   %value = load volatile i32, ptr addrspace(4) %queue_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index b89dbd42e0466f..8999b8f0d07678 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -287,20 +287,24 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xd0
+; GFX8V5-NEXT:    s_add_u32 s2, s6, 8
+; GFX8V5-NEXT:    s_addc_u32 s3, s7, 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8V5-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V5-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8V5-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, s8
-; GFX8V5-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s1
@@ -327,16 +331,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V5:       ; %bb.0:
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xd0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9V5-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[0:1] glc
 ; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[6:7] offset:8 glc
 ; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
-; GFX9V5-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9V5-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)



More information about the llvm-commits mailing list