[llvm] [AMDGPU] Lower `llvm.amdgcn.queue.ptr` intrinsic to using implicit kernel argument if feasible (PR #103490)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 13 20:45:57 PDT 2024


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/103490

>From f3ca3ef4fca1c3f1b81278bc3823a791fb5efe2b Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 13 Aug 2024 20:33:19 -0400
Subject: [PATCH] [AMDGPU] Lower `llvm.amdgcn.queue.ptr` intrinsic to using
 implicit kernel argument if feasible

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 31 ++++++++++++++++++-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |  3 ++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  7 ++++-
 ...licit-kernarg-backend-usage-global-isel.ll | 21 ++++++++-----
 .../abi-attribute-hints-undefined-behavior.ll | 18 +++++++----
 .../callee-special-input-sgprs-fixed-abi.ll   |  3 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  | 22 ++++++++-----
 7 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c6c4b8f9306471..c0c842382622c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4456,6 +4456,27 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
   return true;
 }
 
+/// Replace \p MI with a load of a \p Ty implicit kernarg found at \p Offset.
+bool AMDGPULegalizerInfo::legalizeImplicitKernelargParameterPtr(
+    MachineInstr &MI, MachineIRBuilder &B, LLT Ty, unsigned Offset) const {
+  MachineFunction &MF = *MI.getMF();
+  assert(AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
+         AMDGPU::AMDHSA_COV5);
+
+  Register Ptr = getKernargParameterPtr(B, Offset);
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+  // Align takes bytes, not bits: use the type's byte size, limited by Offset.
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo,
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      Ty, commonAlignment(Align(Ty.getSizeInBytes()), Offset));
+  Register Temp = B.buildLoad(Ty, Ptr, *MMO).getReg(0);
+  B.buildCopy(MI.getOperand(0).getReg(), Temp);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B) const {
@@ -7312,9 +7333,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_dispatch_ptr:
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
-  case Intrinsic::amdgcn_queue_ptr:
+  case Intrinsic::amdgcn_queue_ptr: {
+    MachineFunction &MF = *MI.getMF();
+    Module *M = MF.getFunction().getParent();
+    if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
+      uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset(
+          B.getMF(), AMDGPUTargetLowering::QUEUE_PTR);
+      return legalizeImplicitKernelargParameterPtr(MI, B, S64, Offset);
+    }
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
+  }
   case Intrinsic::amdgcn_implicit_buffer_ptr:
     return legalizePreloadedArgIntrin(
       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index db1c5874093a71..8aa83eeee420ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -131,6 +131,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
                                    uint64_t Offset,
                                    Align Alignment = Align(4)) const;
+  bool legalizeImplicitKernelargParameterPtr(MachineInstr &MI,
+                                             MachineIRBuilder &B, LLT Ty,
+                                             unsigned Offset) const;
 
   bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 86fc100f1c2da0..52f93472eac206 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8414,8 +8414,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
   }
-  case Intrinsic::amdgcn_dispatch_ptr:
   case Intrinsic::amdgcn_queue_ptr: {
+    const Module *M = DAG.getMachineFunction().getFunction().getParent();
+    if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5)
+      return loadImplicitKernelArgument(DAG, MVT::i64, DL, Align(8), QUEUE_PTR);
+    [[fallthrough]];
+  }
+  case Intrinsic::amdgcn_dispatch_ptr: {
     if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
       DiagnosticInfoUnsupported BadIntrin(
           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 9443b39dcdc033..0534623539a5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -295,14 +295,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xd0
+; GFX8V5-NEXT:    s_add_u32 s2, s6, 8
+; GFX8V5-NEXT:    s_addc_u32 s3, s7, 0
+; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V5-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8V5-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -336,15 +340,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V5:       ; %bb.0:
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xd0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9V5-NEXT:    global_load_ubyte v0, v[0:1], off glc
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[0:1] glc
 ; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[6:7] offset:8 glc
 ; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9V5-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e53653408feb40..d2f8487f9d13d5 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -207,6 +207,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_func_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-NEXT:    s_mov_b64 s[6:7], 0xc8
+; FIXEDABI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s6
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s7
 ; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
@@ -238,12 +241,15 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_add_u32 s0, s4, 8
-; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT:    s_addc_u32 s1, s5, 0
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v0, s0
-; FIXEDABI-NEXT:    v_mov_b32_e32 v1, s1
+; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd0
+; FIXEDABI-NEXT:    s_add_u32 s2, s4, 8
+; FIXEDABI-NEXT:    s_addc_u32 s3, s5, 0
+; FIXEDABI-NEXT:    v_mov_b32_e32 v0, s2
+; FIXEDABI-NEXT:    v_mov_b32_e32 v1, s3
+; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s1
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s0
+; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..c2c6670c3ecacd 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -22,7 +22,8 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
 }
 
 ; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
+; GCN: s_mov_b64 s[4:5], 0xc8
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 define hidden void @use_queue_ptr() #1 {
   %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
   %value = load volatile i32, ptr addrspace(4) %queue_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index b89dbd42e0466f..8999b8f0d07678 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -287,20 +287,24 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xd0
+; GFX8V5-NEXT:    s_add_u32 s2, s6, 8
+; GFX8V5-NEXT:    s_addc_u32 s3, s7, 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8V5-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V5-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8V5-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, s8
-; GFX8V5-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s1
@@ -327,16 +331,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V5:       ; %bb.0:
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xd0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9V5-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[0:1] glc
 ; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[6:7] offset:8 glc
 ; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
-; GFX9V5-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9V5-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)



More information about the llvm-commits mailing list