[llvm] 0f20a35 - AMDGPU: Set up User SGPRs for queue_ptr only when necessary
Changpeng Fang via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 9 10:15:02 PST 2022
Author: Changpeng Fang
Date: 2022-03-09T10:14:05-08:00
New Revision: 0f20a35b9e4bf4fdc4fb5ff2b5f9beee48081e9c
URL: https://github.com/llvm/llvm-project/commit/0f20a35b9e4bf4fdc4fb5ff2b5f9beee48081e9c
DIFF: https://github.com/llvm/llvm-project/commit/0f20a35b9e4bf4fdc4fb5ff2b5f9beee48081e9c.diff
LOG: AMDGPU: Set up User SGPRs for queue_ptr only when necessary
Summary:
In general, we need queue_ptr for aperture bases and trap handling,
and user SGPRs have to be set up to hold queue_ptr. In the current implementation,
user SGPRs are set up unnecessarily for some cases. If the target has aperture
registers, queue_ptr is not needed to reference aperture bases. For trap
handling, if the target supports getDoorbellID, queue_ptr is also not necessary.
Further, code object version 5 introduces a new kernel ABI which passes queue_ptr
as an implicit kernel argument, so user SGPRs are no longer necessary for
queue_ptr. Based on the trap handling document:
https://llvm.org/docs/AMDGPUUsage.html#amdgpu-trap-handler-for-amdhsa-os-v4-onwards-table,
llvm.debugtrap does not need queue_ptr, so we remove queue_ptr support for llvm.debugtrap
in the backend.
Reviewers: sameerds, arsenm
Fixes: SWDEV-307189
Differential Revision: https://reviews.llvm.org/D119762
Added:
llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll
llvm/test/CodeGen/AMDGPU/kernarg-size.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
llvm/test/CodeGen/AMDGPU/trap-abis.ll
llvm/test/CodeGen/AMDGPU/trap.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b7a16fde574bd..749faa51c8c2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -393,7 +393,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
}
- if (MFI.hasQueuePtr()) {
+ if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
}
@@ -1090,7 +1090,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
- if (MFI->hasQueuePtr())
+ if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
if (MFI->hasKernargSegmentPtr())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 1d5b4ba672dbd..46748c9365cea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -50,7 +50,9 @@ static constexpr std::pair<ImplicitArgumentMask,
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
-intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
+intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
+ bool HasApertureRegs, bool SupportsGetDoorBellID) {
+ unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
switch (ID) {
case Intrinsic::amdgcn_workitem_id_x:
NonKernelOnly = true;
@@ -76,13 +78,23 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
return DISPATCH_ID;
case Intrinsic::amdgcn_implicitarg_ptr:
return IMPLICIT_ARG_PTR;
+ // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
+ // queue_ptr.
case Intrinsic::amdgcn_queue_ptr:
+ NeedsImplicit = (CodeObjectVersion == 5);
+ return QUEUE_PTR;
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
- // TODO: Does not require the queue pointer on gfx9+
+ if (HasApertureRegs)
+ return NOT_IMPLICIT_INPUT;
+ // Under V5, we need implicitarg_ptr + offsets to access private_base or
+ // shared_base. For pre-V5, however, need to access them through queue_ptr +
+ // offsets.
+ return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
case Intrinsic::trap:
- case Intrinsic::debugtrap:
- IsQueuePtr = true;
+ if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
+ return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
+ NeedsImplicit = (CodeObjectVersion == 5); // Need impicitarg_ptr under V5.
return QUEUE_PTR;
default:
return NOT_IMPLICIT_INPUT;
@@ -129,6 +141,12 @@ class AMDGPUInformationCache : public InformationCache {
return ST.hasApertureRegs();
}
+ /// Check if the subtarget supports GetDoorbellID.
+ bool supportsGetDoorbellID(Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.supportsGetDoorbellID();
+ }
+
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getFlatWorkGroupSizes(F);
@@ -381,7 +399,10 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
- bool NeedsQueuePtr = false;
+ bool NeedsImplicit = false;
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
+ bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
for (Function *Callee : AAEdges.getOptimisticEdges()) {
Intrinsic::ID IID = Callee->getIntrinsicID();
@@ -394,19 +415,25 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
bool NonKernelOnly = false;
ImplicitArgumentMask AttrMask =
- intrinsicToAttrMask(IID, NonKernelOnly, NeedsQueuePtr);
+ intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
+ HasApertureRegs, SupportsGetDoorbellID);
if (AttrMask != NOT_IMPLICIT_INPUT) {
if ((IsNonEntryFunc || !NonKernelOnly))
removeAssumedBits(AttrMask);
}
}
- if (!NeedsQueuePtr) {
- NeedsQueuePtr = checkForQueuePtr(A);
- }
+ // Need implicitarg_ptr to acess queue_ptr, private_base, and shared_base.
+ if (NeedsImplicit)
+ removeAssumedBits(IMPLICIT_ARG_PTR);
- if (NeedsQueuePtr) {
- removeAssumedBits(QUEUE_PTR);
+ if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
+ // Under V5, we need implicitarg_ptr + offsets to access private_base or
+ // shared_base. We do not actually need queue_ptr.
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
+ removeAssumedBits(IMPLICIT_ARG_PTR);
+ else
+ removeAssumedBits(QUEUE_PTR);
}
if (funcRetrievesHostcallPtr(A)) {
@@ -419,6 +446,11 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
removeAssumedBits(HEAP_PTR);
}
+ if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
+ assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
+ removeAssumedBits(QUEUE_PTR);
+ }
+
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
: ChangeStatus::UNCHANGED;
}
@@ -515,6 +547,14 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
return funcRetrievesImplicitKernelArg(A, OAS);
}
+ bool funcRetrievesQueuePtr(Attributor &A) {
+ if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
+ return false;
+ auto Pos = llvm::AMDGPU::getQueuePtrImplicitArgPosition();
+ AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+ return funcRetrievesImplicitKernelArg(A, OAS);
+ }
+
bool funcRetrievesImplicitKernelArg(Attributor &A,
AAPointerInfo::OffsetAndSize OAS) {
// Check if this is a call to the implicitarg_ptr builtin and it
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 96166e62f7c5b..05468141ea5b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -453,7 +453,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info.hasQueuePtr()) {
+ if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 23f9ea82e49eb..3b5881a53ea18 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -1043,8 +1043,9 @@ void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF,
Offset += 72; // Reserved.
- // hidden_private_base and hidden_shared_base are only used by GFX8.
- if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // hidden_private_base and hidden_shared_base are only when the subtarget has
+ // ApertureRegs.
+ if (!ST.hasApertureRegs()) {
emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args);
emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args);
} else
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 66bc7b7c9b0cb..bfd641fdd9e74 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2115,7 +2115,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
- if (Info.hasQueuePtr())
+ if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
@@ -2162,7 +2162,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info.hasQueuePtr()) {
+ if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c57410f7016cf..ce034dca3691e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -163,6 +163,13 @@ unsigned getHeapPtrImplicitArgPosition() {
return 0;
}
+unsigned getQueuePtrImplicitArgPosition() {
+ if (AmdhsaCodeObjectVersion == 5)
+ return 200;
+ llvm_unreachable("queue_ptr is supported only by code object version 5");
+ return 0;
+}
+
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index abb964b4c67ce..5526d18b2dcc8 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -60,6 +60,9 @@ unsigned getHostcallImplicitArgPosition();
/// \returns The offset of the heap ptr argument from implicitarg_ptr
unsigned getHeapPtrImplicitArgPosition();
+/// \returns The offset of the queue ptr argument from implicitarg_ptr
+unsigned getQueuePtrImplicitArgPosition();
+
/// \returns Code object version.
unsigned getAmdhsaCodeObjectVersion();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
index 4135cdce82c65..e17ee5910a253 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -26,7 +26,7 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
;
; GFX9-LABEL: is_private_vgpr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc
@@ -40,7 +40,7 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
;
; GFX10-LABEL: is_private_vgpr:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
@@ -79,7 +79,7 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
;
; GFX9-LABEL: is_private_sgpr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
@@ -94,7 +94,7 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
;
; GFX10-LABEL: is_private_sgpr:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
index b5fbad0b3c94d..5e67726d02d13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -26,7 +26,7 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
;
; GFX9-LABEL: is_local_vgpr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc
@@ -40,7 +40,7 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
;
; GFX10-LABEL: is_local_vgpr:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
@@ -79,7 +79,7 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
;
; GFX9-LABEL: is_local_sgpr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
@@ -94,7 +94,7 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
;
; GFX10-LABEL: is_local_sgpr:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
index 5d421729464a2..1f3462cdece1a 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
@@ -93,7 +93,7 @@
; GFX8-NEXT: - .offset: 220
; GFX8-NEXT: .size: 4
; GFX8-NEXT: .value_kind: hidden_shared_base
-; CHECK-NEXT: - .address_space: global
+; CHECK: - .address_space: global
; CHECK-NEXT: .offset: 224
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_queue_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll
index e1ffd338e5b75..09a0142da7eb7 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll
@@ -24,9 +24,16 @@ define amdgpu_kernel void @addrspacecast_requires_queue_ptr(i32 addrspace(5)* %p
}
; CHECK: - .args:
-; CHECK: .offset: 208
-; CHECK-NEXT: .size: 8
-; CHECK-NEXT: .value_kind: hidden_queue_ptr
+; CHECK: .value_kind: hidden_multigrid_sync_arg
+; PRE-GFX9: .offset: 200
+; PRE-GFX9-NEXT: .size: 4
+; PRE-GFX9-NEXT: .value_kind: hidden_private_base
+; PRE-GFX9-NEXT: .offset: 204
+; PRE-GFX9-NEXT: .size: 4
+; PRE-GFX9-NEXT: .value_kind: hidden_shared_base
+; GFX9-NOT: .value_kind: hidden_multigrid_sync_arg
+; GFX9-NOT: .value_kind: hidden_private_base
+; CKECK-NOT: .value_kind: hidden_queue_ptr
; CHECK: .name: is_shared_requires_queue_ptr
; CHECK: .symbol: is_shared_requires_queue_ptr.kd
define amdgpu_kernel void @is_shared_requires_queue_ptr(i8* %ptr) {
@@ -37,9 +44,16 @@ define amdgpu_kernel void @is_shared_requires_queue_ptr(i8* %ptr) {
}
; CHECK: - .args:
-; CHECK: .offset: 208
-; CHECK-NEXT: .size: 8
-; CHECK-NEXT: .value_kind: hidden_queue_ptr
+; CHECK: .value_kind: hidden_multigrid_sync_arg
+; PRE-GFX9: .offset: 200
+; PRE-GFX9-NEXT: .size: 4
+; PRE-GFX9-NEXT: .value_kind: hidden_private_base
+; PRE-GFX9-NEXT: .offset: 204
+; PRE-GFX9-NEXT: .size: 4
+; PRE-GFX9-NEXT: .value_kind: hidden_shared_base
+; GFX9-NOT: .value_kind: hidden_private_base
+; GFX9-NOT: .value_kind: hidden_shared_base
+; CKECK-NOT: .value_kind: hidden_queue_ptr
; CHECK: .name: is_private_requires_queue_ptr
; CHECK: .symbol: is_private_requires_queue_ptr.kd
define amdgpu_kernel void @is_private_requires_queue_ptr(i8* %ptr) {
@@ -50,9 +64,20 @@ define amdgpu_kernel void @is_private_requires_queue_ptr(i8* %ptr) {
}
; CHECK: - .args:
-; CHECK: .offset: 200
-; CHECK-NEXT: .size: 8
-; CHECK-NEXT: .value_kind: hidden_queue_ptr
+; CHECK: .value_kind: hidden_multigrid_sync_arg
+; PRE-GFX9: .offset: 192
+; PRE-GFX9-NEXT: .size: 4
+; PRE-GFX9-NEXT: .value_kind: hidden_private_base
+; PRE-GFX9-NEXT: .offset: 196
+; PRE-GFX9-NEXT: .size: 4
+; PRE-GFX9-NEXT: .value_kind: hidden_shared_base
+; PRE-GFX9-NEXT: .address_space: global
+; PRE-GFX9-NEXT: .offset: 200
+; PRE-GFX9-NEXT: .size: 8
+; PRE-GFX9-NEXT: .value_kind: hidden_queue_ptr
+; GFX9-NOT: .value_kind: hidden_private_base
+; GFX9-NOT: .value_kind: hidden_shared_base
+; GFX9-NOT: .value_kind: hidden_queue_ptr
; CHECK: .name: trap_requires_queue_ptr
; CHECK: .symbol: trap_requires_queue_ptr.kd
define amdgpu_kernel void @trap_requires_queue_ptr() {
@@ -60,17 +85,6 @@ define amdgpu_kernel void @trap_requires_queue_ptr() {
unreachable
}
-; CHECK: - .args:
-; CHECK: .offset: 200
-; CHECK-NEXT: .size: 8
-; CHECK-NEXT: .value_kind: hidden_queue_ptr
-; CHECK: .name: debugtrap_requires_queue_ptr
-; CHECK: .symbol: debugtrap_requires_queue_ptr.kd
-define amdgpu_kernel void @debugtrap_requires_queue_ptr() {
- call void @llvm.debugtrap()
- unreachable
-}
-
; CHECK: - .args:
; CHECK: .offset: 208
; CHECK-NEXT: .size: 8
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll
new file mode 100644
index 0000000000000..b2230b4a8321f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll
@@ -0,0 +1,301 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s
+
+declare void @function1()
+
+declare void @function2() #0
+
+; Function Attrs: noinline
+define void @function3(i8 addrspace(4)* %argptr, i8 addrspace(4)* addrspace(1)* %sink) #2 {
+ store i8 addrspace(4)* %argptr, i8 addrspace(4)* addrspace(1)* %sink, align 8
+ ret void
+}
+
+; Function Attrs: noinline
+define void @function4(i64 %arg, i64* %a) #2 {
+ store i64 %arg, i64* %a
+ ret void
+}
+
+; Function Attrs: noinline
+define void @function5(i8 addrspace(4)* %ptr, i64* %sink) #2 {
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 168
+ %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
+ %x = load i64, i64 addrspace(4)* %cast
+ store i64 %x, i64* %sink
+ ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
+
+; CHECK: amdhsa.kernels:
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel10
+define amdgpu_kernel void @test_kernel10(i8* %a) {
+ store i8 3, i8* %a, align 1
+ ret void
+}
+
+; Call to an extern function
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel20
+define amdgpu_kernel void @test_kernel20(i8* %a) {
+ call void @function1()
+ store i8 3, i8* %a, align 1
+ ret void
+}
+
+; Explicit attribute on kernel
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel21
+define amdgpu_kernel void @test_kernel21(i8* %a) #0 {
+ call void @function1()
+ store i8 3, i8* %a, align 1
+ ret void
+}
+
+; Explicit attribute on extern callee
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel22
+define amdgpu_kernel void @test_kernel22(i8* %a) {
+ call void @function2()
+ store i8 3, i8* %a, align 1
+ ret void
+}
+
+; Access more bytes than the pointer size
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel30
+define amdgpu_kernel void @test_kernel30(i128* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 192
+ %cast = bitcast i8 addrspace(4)* %gep to i128 addrspace(4)*
+ %x = load i128, i128 addrspace(4)* %cast
+ store i128 %x, i128* %a
+ ret void
+}
+
+; Typical load of queue pointer
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel40
+define amdgpu_kernel void @test_kernel40(i64* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200
+ %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
+ %x = load i64, i64 addrspace(4)* %cast
+ store i64 %x, i64* %a
+ ret void
+}
+
+; Typical usage, overriden by explicit attribute on kernel
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel41
+define amdgpu_kernel void @test_kernel41(i64* %a) #0 {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200
+ %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
+ %x = load i64, i64 addrspace(4)* %cast
+ store i64 %x, i64* %a
+ ret void
+}
+
+; Access to implicit arg before the queue pointer
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel42
+define amdgpu_kernel void @test_kernel42(i64* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 192
+ %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
+ %x = load i64, i64 addrspace(4)* %cast
+ store i64 %x, i64* %a
+ ret void
+}
+
+; Access to implicit arg after the queue pointer
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel43
+define amdgpu_kernel void @test_kernel43(i64* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 208
+ %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
+ %x = load i64, i64 addrspace(4)* %cast
+ store i64 %x, i64* %a
+ ret void
+}
+
+; Accessing a byte just before the queue pointer
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel44
+define amdgpu_kernel void @test_kernel44(i8* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 199
+ %x = load i8, i8 addrspace(4)* %gep, align 1
+ store i8 %x, i8* %a, align 1
+ ret void
+}
+
+; Accessing a byte inside the queue pointer
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel45
+define amdgpu_kernel void @test_kernel45(i8* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200
+ %x = load i8, i8 addrspace(4)* %gep, align 1
+ store i8 %x, i8* %a, align 1
+ ret void
+}
+
+; Accessing a byte inside the queue pointer
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel46
+define amdgpu_kernel void @test_kernel46(i8* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 207
+ %x = load i8, i8 addrspace(4)* %gep, align 1
+ store i8 %x, i8* %a, align 1
+ ret void
+}
+
+; Accessing a byte just after the queue pointer
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel47
+define amdgpu_kernel void @test_kernel47(i8* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 208
+ %x = load i8, i8 addrspace(4)* %gep, align 1
+ store i8 %x, i8* %a, align 1
+ ret void
+}
+
+; Access with an unknown offset
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel50
+define amdgpu_kernel void @test_kernel50(i8* %a, i32 %b) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 %b
+ %x = load i8, i8 addrspace(4)* %gep, align 1
+ store i8 %x, i8* %a, align 1
+ ret void
+}
+
+; Multiple geps reaching the queue pointer argument.
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel51
+define amdgpu_kernel void @test_kernel51(i8* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep1 = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16
+ %gep2 = getelementptr inbounds i8, i8 addrspace(4)* %gep1, i64 184
+ %x = load i8, i8 addrspace(4)* %gep2, align 1
+ store i8 %x, i8* %a, align 1
+ ret void
+}
+
+; Multiple geps not reaching the queue pointer argument.
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel52
+define amdgpu_kernel void @test_kernel52(i8* %a) {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep1 = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16
+ %gep2 = getelementptr inbounds i8, i8 addrspace(4)* %gep1, i64 16
+ %x = load i8, i8 addrspace(4)* %gep2, align 1
+ store i8 %x, i8* %a, align 1
+ ret void
+}
+
+; Queue pointer used inside a function call
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel60
+define amdgpu_kernel void @test_kernel60(i64* %a) #2 {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200
+ %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
+ %x = load i64, i64 addrspace(4)* %cast
+ call void @function4(i64 %x, i64* %a)
+ ret void
+}
+
+; Queue pointer retrieved inside a function call; chain of geps
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel61
+define amdgpu_kernel void @test_kernel61(i64* %a) #2 {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 32
+ call void @function5(i8 addrspace(4)* %gep, i64* %a)
+ ret void
+}
+
+; Pointer captured
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel70
+define amdgpu_kernel void @test_kernel70(i8 addrspace(4)* addrspace(1)* %sink) #2 {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
+ store i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* %sink, align 8
+ ret void
+}
+
+; Pointer captured inside function call
+
+; CHECK: - .args:
+; CHECK: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel71
+define amdgpu_kernel void @test_kernel71(i8 addrspace(4)* addrspace(1)* %sink) #2 {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
+ call void @function3(i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* %sink)
+ ret void
+}
+
+; Ineffective pointer capture
+
+; CHECK: - .args:
+; CHECK-NOT: hidden_queue_ptr
+; CHECK-LABEL: .name: test_kernel72
+define amdgpu_kernel void @test_kernel72() #2 {
+ %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
+ store i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* undef, align 8
+ ret void
+}
+
+attributes #0 = { "amdgpu-no-queue-ptr" }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index d9fdfb0d101af..fd195f9b790eb 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -1,9 +1,8 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=HSA %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=HSA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=DOORBELL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=DOORBELL %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=HSA %s
declare void @llvm.trap() #0
-declare void @llvm.debugtrap() #1
; HSA: .amdhsa_kernel trap
; HSA-NEXT: .amdhsa_group_segment_fixed_size 0
@@ -13,6 +12,14 @@ declare void @llvm.debugtrap() #1
; HSA-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
; HSA: .end_amdhsa_kernel
+; DOORBELL: .amdhsa_kernel trap
+; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0
+; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0
+; DOORBELL-NEXT: .amdhsa_kernarg_size 8
+; DOORBELL-NEXT: .amdhsa_user_sgpr_count 6
+; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+; DOORBELL: .end_amdhsa_kernel
+
define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
store volatile i32 1, i32 addrspace(1)* %arg0
call void @llvm.trap()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 9be69a9fb0861..f5c137a056b73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -26,7 +26,7 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}}
-; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}}
+; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}}
; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16
; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 7948f0c0a18b9..f98676b96439d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -28,7 +28,7 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
; GFX9-DAG: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16
; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}}
-; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}}
+; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}}
; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
; GCN: s_cbranch_vccnz
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 5e669cded16fe..a64507ca5861c 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -38,7 +38,7 @@ define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
;
; NOHSA-TRAP-GFX900-V4-LABEL: trap:
; NOHSA-TRAP-GFX900-V4: ; %bb.0:
-; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
@@ -242,7 +242,7 @@ define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
;
; HSA-TRAP-GFX900-V4-LABEL: trap:
; HSA-TRAP-GFX900-V4: ; %bb.0:
-; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
@@ -340,7 +340,7 @@ define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
;
; HSA-NOTRAP-GFX900-V4-LABEL: trap:
; HSA-NOTRAP-GFX900-V4: ; %bb.0:
-; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
@@ -391,7 +391,7 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
;
; NOHSA-TRAP-GFX900-V4-LABEL: non_entry_trap:
; NOHSA-TRAP-GFX900-V4: ; %bb.0: ; %entry
-; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
; NOHSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
@@ -643,7 +643,7 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
;
; HSA-TRAP-GFX900-V4-LABEL: non_entry_trap:
; HSA-TRAP-GFX900-V4: ; %bb.0: ; %entry
-; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
@@ -762,7 +762,7 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
;
; HSA-NOTRAP-GFX900-V4-LABEL: non_entry_trap:
; HSA-NOTRAP-GFX900-V4: ; %bb.0: ; %entry
-; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NOTRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
@@ -793,7 +793,7 @@ ret:
define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-V2-LABEL: debugtrap:
; NOHSA-TRAP-GFX900-V2: ; %bb.0:
-; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2
@@ -806,7 +806,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
;
; NOHSA-TRAP-GFX900-V3-LABEL: debugtrap:
; NOHSA-TRAP-GFX900-V3: ; %bb.0:
-; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2
@@ -819,7 +819,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
;
; NOHSA-TRAP-GFX900-V4-LABEL: debugtrap:
; NOHSA-TRAP-GFX900-V4: ; %bb.0:
-; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2
@@ -852,7 +852,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-TRAP-GFX803-V2-NEXT: enable_mem_ordered = 0
; HSA-TRAP-GFX803-V2-NEXT: enable_fwd_progress = 0
; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; HSA-TRAP-GFX803-V2-NEXT: user_sgpr_count = 8
+; HSA-TRAP-GFX803-V2-NEXT: user_sgpr_count = 6
; HSA-TRAP-GFX803-V2-NEXT: enable_trap_handler = 0
; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_workgroup_id_x = 1
; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_workgroup_id_y = 0
@@ -864,7 +864,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-TRAP-GFX803-V2-NEXT: enable_exception = 0
; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_private_segment_buffer = 1
; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_dispatch_ptr = 0
-; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_queue_ptr = 1
+; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_queue_ptr = 0
; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_dispatch_id = 0
; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_flat_scratch_init = 0
@@ -884,7 +884,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-TRAP-GFX803-V2-NEXT: gds_segment_byte_size = 0
; HSA-TRAP-GFX803-V2-NEXT: kernarg_segment_byte_size = 8
; HSA-TRAP-GFX803-V2-NEXT: workgroup_fbarrier_count = 0
-; HSA-TRAP-GFX803-V2-NEXT: wavefront_sgpr_count = 8
+; HSA-TRAP-GFX803-V2-NEXT: wavefront_sgpr_count = 6
; HSA-TRAP-GFX803-V2-NEXT: workitem_vgpr_count = 4
; HSA-TRAP-GFX803-V2-NEXT: reserved_vgpr_first = 0
; HSA-TRAP-GFX803-V2-NEXT: reserved_vgpr_count = 0
@@ -900,7 +900,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-TRAP-GFX803-V2-NEXT: runtime_loader_kernel_symbol = 0
; HSA-TRAP-GFX803-V2-NEXT: .end_amd_kernel_code_t
; HSA-TRAP-GFX803-V2-NEXT: ; %bb.0:
-; HSA-TRAP-GFX803-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX803-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v2, 1
; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-V2-NEXT: s_waitcnt lgkmcnt(0)
@@ -915,7 +915,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
;
; HSA-TRAP-GFX803-V3-LABEL: debugtrap:
; HSA-TRAP-GFX803-V3: ; %bb.0:
-; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 1
; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0)
@@ -930,7 +930,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
;
; HSA-TRAP-GFX803-V4-LABEL: debugtrap:
; HSA-TRAP-GFX803-V4: ; %bb.0:
-; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 1
; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0)
@@ -965,7 +965,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-TRAP-GFX900-V2-NEXT: enable_mem_ordered = 0
; HSA-TRAP-GFX900-V2-NEXT: enable_fwd_progress = 0
; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; HSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 8
+; HSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 6
; HSA-TRAP-GFX900-V2-NEXT: enable_trap_handler = 0
; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1
; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0
@@ -977,7 +977,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-TRAP-GFX900-V2-NEXT: enable_exception = 0
; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 1
; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0
-; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1
+; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 0
; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0
; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0
@@ -997,7 +997,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0
; HSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8
; HSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0
-; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8
+; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6
; HSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3
; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0
; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0
@@ -1013,7 +1013,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-TRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0
; HSA-TRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t
; HSA-TRAP-GFX900-V2-NEXT: ; %bb.0:
-; HSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1
; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2
@@ -1027,7 +1027,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
;
; HSA-TRAP-GFX900-V3-LABEL: debugtrap:
; HSA-TRAP-GFX900-V3: ; %bb.0:
-; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2
@@ -1041,7 +1041,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
;
; HSA-TRAP-GFX900-V4-LABEL: debugtrap:
; HSA-TRAP-GFX900-V4: ; %bb.0:
-; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2
@@ -1075,7 +1075,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-NOTRAP-GFX900-V2-NEXT: enable_mem_ordered = 0
; HSA-NOTRAP-GFX900-V2-NEXT: enable_fwd_progress = 0
; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; HSA-NOTRAP-GFX900-V2-NEXT: user_sgpr_count = 8
+; HSA-NOTRAP-GFX900-V2-NEXT: user_sgpr_count = 6
; HSA-NOTRAP-GFX900-V2-NEXT: enable_trap_handler = 0
; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1
; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0
@@ -1087,7 +1087,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-NOTRAP-GFX900-V2-NEXT: enable_exception = 0
; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 1
; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0
-; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1
+; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 0
; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0
; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0
@@ -1107,7 +1107,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-NOTRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0
; HSA-NOTRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8
; HSA-NOTRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0
-; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8
+; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6
; HSA-NOTRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3
; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0
; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0
@@ -1123,7 +1123,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
; HSA-NOTRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0
; HSA-NOTRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t
; HSA-NOTRAP-GFX900-V2-NEXT: ; %bb.0:
-; HSA-NOTRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0
; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1
; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2
@@ -1136,7 +1136,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
;
; HSA-NOTRAP-GFX900-V3-LABEL: debugtrap:
; HSA-NOTRAP-GFX900-V3: ; %bb.0:
-; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2
@@ -1149,7 +1149,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
;
; HSA-NOTRAP-GFX900-V4-LABEL: debugtrap:
; HSA-NOTRAP-GFX900-V4: ; %bb.0:
-; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2
diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index 0998b09877859..64f4064610468 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -51,11 +51,11 @@ define amdgpu_kernel void @hsa_trap(i32 addrspace(1)* nocapture readonly %arg0)
; MESA-TRAP: .section .AMDGPU.config
; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 208
+; MESA-TRAP-NEXT: .long 204
; NOMESA-TRAP: .section .AMDGPU.config
; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 144
+; NOMESA-TRAP-NEXT: .long 140
; GCN-LABEL: {{^}}hsa_debugtrap:
; HSA-TRAP: enable_trap_handler = 0
More information about the llvm-commits
mailing list