[llvm] a047147 - [AMDGPU] Add a trap lowering workaround for gfx11 (#85854)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 24 06:43:58 PDT 2024
Author: Emma Pilkington
Date: 2024-04-24T09:43:54-04:00
New Revision: a04714701f17663e3e9879a50ed07bbb421ca209
URL: https://github.com/llvm/llvm-project/commit/a04714701f17663e3e9879a50ed07bbb421ca209
DIFF: https://github.com/llvm/llvm-project/commit/a04714701f17663e3e9879a50ed07bbb421ca209.diff
LOG: [AMDGPU] Add a trap lowering workaround for gfx11 (#85854)
On gfx11 shaders run with PRIV=1, which causes `s_trap 2` to be treated
as a nop, which means it isn't a correct lowering for the trap
intrinsic. As a workaround, this commit instead lowers the trap
intrinsic to instructions that simulate the behavior of s_trap 2.
Fixes: SWDEV-438421
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/trap-abis.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5c2c6d4b13c669..2b81f5d51032dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -307,6 +307,12 @@ def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
"MSAA loads not honoring dst_sel bug"
>;
+def FeaturePrivEnabledTrap2NopBug : SubtargetFeature<"priv-enabled-trap2-nop-bug",
+ "HasPrivEnabledTrap2NopBug",
+ "true",
+ "Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug"
+>;
+
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -1487,13 +1493,15 @@ def FeatureISAVersion11_Generic: FeatureSet<
[FeatureMSAALoadDstSelBug,
FeatureVALUTransUseHazard,
FeatureUserSGPRInit16Bug,
+ FeaturePrivEnabledTrap2NopBug,
FeatureRequiresCOV6])>;
def FeatureISAVersion11_0_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureMSAALoadDstSelBug,
FeatureVALUTransUseHazard,
- FeatureMADIntraFwdBug])>;
+ FeatureMADIntraFwdBug,
+ FeaturePrivEnabledTrap2NopBug])>;
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_0_Common.Features,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index db69d50799e70b..f4a747784d1fd2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5377,6 +5377,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
NODE_NAME_CASE(ENDPGM_TRAP)
+ NODE_NAME_CASE(SIMULATED_TRAP)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f10a357125e562..72661a8d29f816 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -407,6 +407,9 @@ enum NodeType : unsigned {
// s_endpgm, but we may want to insert it in the middle of the block.
ENDPGM_TRAP,
+ // "s_trap 2" equivalent on hardware that does not support it.
+ SIMULATED_TRAP,
+
// Return to a shader part's epilog code.
RETURN_TO_EPILOG,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 82f58ea38fd0a7..702f6e67c55271 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -377,6 +377,8 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone,
[SDNPHasChain]>;
+def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone,
+ [SDNPHasChain]>;
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index de029a4d6bedd2..6cd93abff1a429 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6725,8 +6725,18 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
return true;
}
-bool AMDGPULegalizerInfo::legalizeTrapHsa(
- MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // We need to simulate the 's_trap 2' instruction on targets that run in
+ // PRIV=1 (where it is treated as a nop).
+ if (ST.hasPrivEnabledTrap2NopBug()) {
+ ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
+ MI.getDebugLoc());
+ MI.eraseFromParent();
+ return true;
+ }
+
B.buildInstr(AMDGPU::S_TRAP)
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
MI.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 2ca5ae306b11be..be337e0b2192f2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -224,6 +224,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasMSAALoadDstSelBug = false;
+ bool HasPrivEnabledTrap2NopBug = false;
bool Has1_5xVGPRs = false;
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
@@ -1032,6 +1033,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
+ bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
+
bool hasNSAEncoding() const { return HasNSAEncoding; }
bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 50159d435aef7e..cb4efdc7cf657c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5405,6 +5405,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return SplitBB;
}
+ case AMDGPU::SIMULATED_TRAP: {
+ assert(Subtarget->hasPrivEnabledTrap2NopBug());
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineBasicBlock *SplitBB =
+ TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
+ MI.eraseFromParent();
+ return SplitBB;
+ }
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
@@ -6623,6 +6631,11 @@ SDValue SITargetLowering::lowerTrapHsa(
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
+ // We need to simulate the 's_trap 2' instruction on targets that run in
+ // PRIV=1 (where it is treated as a nop).
+ if (Subtarget->hasPrivEnabledTrap2NopBug())
+ return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
+
uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
SDValue Ops[] = {
Chain,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6102f2e8929a5f..e20fe1b716b647 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2026,6 +2026,57 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
}
}
+MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
+ MachineBasicBlock &MBB,
+ MachineInstr &MI,
+ const DebugLoc &DL) const {
+ MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
+ MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock();
+ MF->push_back(HaltLoop);
+
+ constexpr unsigned DoorbellIDMask = 0x3ff;
+ constexpr unsigned ECQueueWaveAbort = 0x400;
+
+ // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
+ // will be a nop.
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
+ .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
+ Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
+ .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
+ .addUse(AMDGPU::M0);
+ Register DoorbellRegMasked =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
+ .addUse(DoorbellReg)
+ .addImm(DoorbellIDMask);
+ Register SetWaveAbortBit =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
+ .addUse(DoorbellRegMasked)
+ .addImm(ECQueueWaveAbort);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addUse(SetWaveAbortBit);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
+ .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addUse(AMDGPU::TTMP2);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);
+
+ BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
+ BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
+ .addMBB(HaltLoop);
+
+ if (SplitBB != &MBB)
+ MBB.removeSuccessor(SplitBB);
+ MBB.addSuccessor(HaltLoop);
+ HaltLoop->addSuccessor(HaltLoop);
+
+ return SplitBB;
+}
+
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 4c5978cdc6665c..b314b9b2fb5135 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1194,6 +1194,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned Quantity) const override;
void insertReturn(MachineBasicBlock &MBB) const;
+
+ /// Build instructions that simulate the behavior of a `s_trap 2` instructions
+ /// for hardware (namely, gfx11) that runs in PRIV=1 mode. There, s_trap is
+ /// interpreted as a nop.
+ MachineBasicBlock *insertSimulatedTrap(MachineRegisterInfo &MRI,
+ MachineBasicBlock &MBB,
+ MachineInstr &MI,
+ const DebugLoc &DL) const;
+
/// Return the number of wait states that result from executing this
/// instruction.
static unsigned getNumWaitStates(const MachineInstr &MI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d6d49889656bbc..cca8d96f29c0fc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -106,6 +106,12 @@ def ENDPGM_TRAP : SPseudoInstSI<
let usesCustomInserter = 1;
}
+def SIMULATED_TRAP : SPseudoInstSI<(outs), (ins), [(AMDGPUsimulated_trap)],
+ "SIMULATED_TRAP"> {
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+}
+
def ATOMIC_FENCE : SPseudoInstSI<
(outs), (ins i32imm:$ordering, i32imm:$scope),
[(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
new file mode 100644
index 00000000000000..ac98dca00be3df
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s
+
+---
+name: test_trap
+body: |
+ bb.0:
+ ; GFX1100-LABEL: name: test_trap
+ ; GFX1100: successors: %bb.2(0x80000000)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+ ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: S_TRAP 2
+ ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
+ ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
+ ; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
+ ; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
+ ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
+ ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
+ ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
+ ; GFX1100-NEXT: S_BRANCH %bb.2
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: .1:
+ ; GFX1100-NEXT: successors:
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: .2:
+ ; GFX1100-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: S_SETHALT 5
+ ; GFX1100-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX1150-LABEL: name: test_trap
+ ; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+ ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1150-NEXT: S_TRAP 2
+ ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(p1) = G_CONSTANT i64 0
+ G_STORE %0, %1 :: (store 1, addrspace 1)
+ G_TRAP
+ G_STORE %0, %1 :: (store 1, addrspace 1)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 3cd6c98ef4b8e0..dcc5fbd142c427 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -3,6 +3,8 @@
; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803 %s
; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s
; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s
+; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100 %s
+; RUN: llc %s -o - -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100-O0 %s
declare void @llvm.trap() #0
declare void @llvm.debugtrap() #1
@@ -49,6 +51,48 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; HSA-NOTRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX1100-LABEL: trap:
+; HSA-TRAP-GFX1100: ; %bb.0:
+; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT: s_trap 2
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-NEXT: s_sethalt 5
+; HSA-TRAP-GFX1100-NEXT: s_branch .LBB0_1
+;
+; HSA-TRAP-GFX1100-O0-LABEL: trap:
+; HSA-TRAP-GFX1100-O0: ; %bb.0:
+; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2
+; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5
+; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB0_1
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.trap()
unreachable
@@ -128,6 +172,84 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
; HSA-NOTRAP-GFX900-NEXT: s_endpgm
; HSA-NOTRAP-GFX900-NEXT: .LBB1_2: ; %trap
; HSA-NOTRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX1100-LABEL: non_entry_trap:
+; HSA-TRAP-GFX1100: ; %bb.0: ; %entry
+; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, -1, v1
+; HSA-TRAP-GFX1100-NEXT: s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX1100-NEXT: ; %bb.1: ; %ret
+; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v1, 3
+; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT: s_nop 0
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; HSA-TRAP-GFX1100-NEXT: s_endpgm
+; HSA-TRAP-GFX1100-NEXT: .LBB1_2: ; %trap
+; HSA-TRAP-GFX1100-NEXT: s_trap 2
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-NEXT: s_sethalt 5
+; HSA-TRAP-GFX1100-NEXT: s_branch .LBB1_3
+;
+; HSA-TRAP-GFX1100-O0-LABEL: non_entry_trap:
+; HSA-TRAP-GFX1100-O0: ; %bb.0: ; %entry
+; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
+; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
+; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v0, s2, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v0, s3, 1
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 s0, -1
+; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $sgpr1
+; HSA-TRAP-GFX1100-O0-NEXT: v_cmp_eq_u32_e64 s0, v0, s0
+; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_vccnz .LBB1_2
+; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1: ; %trap
+; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2
+; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB1_3
+; HSA-TRAP-GFX1100-O0-NEXT: .LBB1_2: ; %ret
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v2, 3
+; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0
+; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm
+; HSA-TRAP-GFX1100-O0-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5
+; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB1_3
entry:
%tmp29 = load volatile i32, ptr addrspace(1) %arg0
%cmp = icmp eq i32 %tmp29, -1
@@ -197,6 +319,21 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1]
; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; HSA-NOTRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX1100-LABEL: debugtrap:
+; HSA-TRAP-GFX1100: ; %bb.0:
+; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
+; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v2, 2
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT: s_trap 3
+; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT: s_nop 0
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; HSA-TRAP-GFX1100-NEXT: s_endpgm
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.debugtrap()
store volatile i32 2, ptr addrspace(1) %arg0
More information about the llvm-commits
mailing list