[llvm] [AMDGPU] Lower `llvm.amdgcn.queue.ptr` intrinsic to using implicit kernel argument if feasible (PR #103490)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 13 20:45:57 PDT 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/103490
>From f3ca3ef4fca1c3f1b81278bc3823a791fb5efe2b Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 13 Aug 2024 20:33:19 -0400
Subject: [PATCH] [AMDGPU] Lower `llvm.amdgcn.queue.ptr` intrinsic to using
implicit kernel argument if feasible
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 31 ++++++++++++++++++-
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 3 ++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 ++++-
...licit-kernarg-backend-usage-global-isel.ll | 21 ++++++++-----
.../abi-attribute-hints-undefined-behavior.ll | 18 +++++++----
.../callee-special-input-sgprs-fixed-abi.ll | 3 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 22 ++++++++-----
7 files changed, 81 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c6c4b8f9306471..c0c842382622c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4456,6 +4456,27 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
return true;
}
+/// Legalize an intrinsic whose result is a value loaded from the implicit
+/// kernel arguments.
+///
+/// Builds a dereferenceable, invariant load of type \p Ty through a
+/// constant-address pointer obtained from getKernargParameterPtr(B, Offset),
+/// copies the loaded value into the result register of \p MI (operand 0),
+/// and erases \p MI.
+///
+/// \param MI     Instruction to replace; operand 0 receives the loaded value.
+/// \param B      Builder used to emit the replacement instructions.
+/// \param Ty     Type of the value to load.
+/// \param Offset Offset forwarded to getKernargParameterPtr; also used to
+///               bound the alignment recorded on the memory operand.
+/// \returns true unconditionally.
+bool AMDGPULegalizerInfo::legalizeImplicitKernelargParameterPtr(
+    MachineInstr &MI, MachineIRBuilder &B, LLT Ty, unsigned Offset) const {
+  MachineFunction &MF = *MI.getMF();
+
+  // M is only read by the assert below; mark it so that builds with
+  // assertions disabled do not warn about an unused variable.
+  [[maybe_unused]] const Module *M = MF.getFunction().getParent();
+  assert(AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 &&
+         "expected code object version 5 or later");
+
+  Register Ptr = getKernargParameterPtr(B, Offset);
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+
+  // Align is expressed in bytes, so the natural alignment of the loaded value
+  // is its size in bytes (not bits). commonAlignment then reduces it to what
+  // Offset can actually guarantee.
+  const Align NaturalAlign(Ty.getSizeInBytes());
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo,
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      Ty, commonAlignment(NaturalAlign, Offset));
+
+  Register Temp = B.buildLoad(Ty, Ptr, *MMO).getReg(0);
+  B.buildCopy(MI.getOperand(0).getReg(), Temp);
+  MI.eraseFromParent();
+  return true;
+}
+
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -7312,9 +7333,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_dispatch_ptr:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::DISPATCH_PTR);
- case Intrinsic::amdgcn_queue_ptr:
+ case Intrinsic::amdgcn_queue_ptr: {
+ MachineFunction &MF = *MI.getMF();
+ Module *M = MF.getFunction().getParent();
+ if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
+ uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset(
+ B.getMF(), AMDGPUTargetLowering::QUEUE_PTR);
+ return legalizeImplicitKernelargParameterPtr(MI, B, S64, Offset);
+ }
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::QUEUE_PTR);
+ }
case Intrinsic::amdgcn_implicit_buffer_ptr:
return legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index db1c5874093a71..8aa83eeee420ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -131,6 +131,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
uint64_t Offset,
Align Alignment = Align(4)) const;
+ bool legalizeImplicitKernelargParameterPtr(MachineInstr &MI,
+ MachineIRBuilder &B, LLT Ty,
+ unsigned Offset) const;
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 86fc100f1c2da0..52f93472eac206 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8414,8 +8414,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
- case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
+ if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5)
+ return loadImplicitKernelArgument(DAG, MVT::i64, DL, Align(8), QUEUE_PTR);
+ [[fallthrough]];
+ }
+ case Intrinsic::amdgcn_dispatch_ptr: {
if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
DiagnosticInfoUnsupported BadIntrin(
MF.getFunction(), "unsupported hsa intrinsic without hsa target",
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 9443b39dcdc033..0534623539a5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -295,14 +295,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_add_u32 s0, s6, 8
-; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT: s_addc_u32 s1, s7, 0
-; GFX8V5-NEXT: s_waitcnt vmcnt(0)
+; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
+; GFX8V5-NEXT: s_add_u32 s2, s6, 8
+; GFX8V5-NEXT: s_addc_u32 s3, s7, 0
+; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
+; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
+; GFX8V5-NEXT: v_mov_b32_e32 v1, s3
+; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -336,15 +340,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V5: ; %bb.0:
+; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
-; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc
-; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
; GFX9V5-NEXT: v_mov_b32_e32 v1, s9
-; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e53653408feb40..d2f8487f9d13d5 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -207,6 +207,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_func_use_other_sgpr:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-NEXT: s_mov_b64 s[6:7], 0xc8
+; FIXEDABI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s7
; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
@@ -238,12 +241,15 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
; FIXEDABI: ; %bb.0:
-; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
-; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
-; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
-; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
+; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd0
+; FIXEDABI-NEXT: s_add_u32 s2, s4, 8
+; FIXEDABI-NEXT: s_addc_u32 s3, s5, 0
+; FIXEDABI-NEXT: v_mov_b32_e32 v0, s2
+; FIXEDABI-NEXT: v_mov_b32_e32 v1, s3
+; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
+; FIXEDABI-NEXT: v_mov_b32_e32 v3, s1
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s0
+; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..c2c6670c3ecacd 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -22,7 +22,8 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
}
; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
+; GCN: s_mov_b64 s[4:5], 0xc8
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
define hidden void @use_queue_ptr() #1 {
%queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
%value = load volatile i32, ptr addrspace(4) %queue_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index b89dbd42e0466f..8999b8f0d07678 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -287,20 +287,24 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_add_u32 s0, s6, 8
-; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT: s_addc_u32 s1, s7, 0
-; GFX8V5-NEXT: s_waitcnt vmcnt(0)
+; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
+; GFX8V5-NEXT: s_add_u32 s2, s6, 8
+; GFX8V5-NEXT: s_addc_u32 s3, s7, 0
+; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
+; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
+; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
+; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
+; GFX8V5-NEXT: v_mov_b32_e32 v1, s3
+; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
-; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
@@ -327,16 +331,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V5: ; %bb.0:
+; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
+; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
-; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
; GFX9V5-NEXT: v_mov_b32_e32 v1, s9
-; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-commits
mailing list