[llvm] [AMDGPU] Add a trap lowering workaround for gfx11 (PR #85854)

Emma Pilkington via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 25 07:51:29 PDT 2024


https://github.com/epilk updated https://github.com/llvm/llvm-project/pull/85854

>From 5d6135cd918fde8e43b653eec7b5106e3779bcf5 Mon Sep 17 00:00:00 2001
From: Emma Pilkington <emma.pilkington95 at gmail.com>
Date: Tue, 19 Mar 2024 14:41:01 -0400
Subject: [PATCH 1/4] [AMDGPU] Add a trap lowering workaround for gfx11

On gfx11 shaders run with PRIV=1, which causes `s_trap 2` to be treated
as a nop, which means it isn't a correct lowering for the trap
intrinsic. As a workaround, this commit instead lowers the trap
intrinsic to instructions that simulate the behavior of s_trap 2.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |  3 +
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |  2 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 10 ++-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  2 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 11 +++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 47 +++++++++++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  9 +++
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  6 ++
 .../AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 40 +++++++++++
 llvm/test/CodeGen/AMDGPU/trap-abis.ll         | 68 +++++++++++++++++++
 11 files changed, 197 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index bee43b6c18c880..1a4711dc06c4bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5376,6 +5376,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(RETURN_TO_EPILOG)
   NODE_NAME_CASE(ENDPGM)
   NODE_NAME_CASE(ENDPGM_TRAP)
+  NODE_NAME_CASE(SIMULATED_TRAP)
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(SETCC)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f10a357125e562..72661a8d29f816 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -407,6 +407,9 @@ enum NodeType : unsigned {
   // s_endpgm, but we may want to insert it in the middle of the block.
   ENDPGM_TRAP,
 
+  // "s_trap 2" equivalent on hardware that does not support it.
+  SIMULATED_TRAP,
+
   // Return to a shader part's epilog code.
   RETURN_TO_EPILOG,
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 82f58ea38fd0a7..702f6e67c55271 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -377,6 +377,8 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
     [SDNPHasChain, SDNPOptInGlue]>;
 def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone,
     [SDNPHasChain]>;
+def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone,
+    [SDNPHasChain]>;
 
 def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e55d1de01b4fd1..bb79c4a376e4e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6726,8 +6726,14 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
 
 bool AMDGPULegalizerInfo::legalizeTrapHsa(
     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
-  B.buildInstr(AMDGPU::S_TRAP)
-      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
+  if (!ST.requiresSimulatedTrap()) {
+    B.buildInstr(AMDGPU::S_TRAP)
+        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
+    MI.eraseFromParent();
+    return true;
+  }
+
+  ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, MI.getDebugLoc());
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4da10beabe3162..67868f273b91df 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -443,6 +443,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
   }
 
+  bool requiresSimulatedTrap() const { return getGeneration() == GFX11; }
+
   bool supportsGetDoorbellID() const {
     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
     return getGeneration() >= GFX9;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81a231f0cade64..174182a5eabba2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5409,6 +5409,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     MI.eraseFromParent();
     return SplitBB;
   }
+  case AMDGPU::SIMULATED_TRAP: {
+    assert(Subtarget->requiresSimulatedTrap());
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    MachineBasicBlock *SplitBB =
+        TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
+    MI.eraseFromParent();
+    return SplitBB;
+  }
   default:
     if (TII->isImage(MI) || TII->isMUBUF(MI)) {
       if (!MI.mayStore())
@@ -6627,6 +6635,9 @@ SDValue SITargetLowering::lowerTrapHsa(
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
 
+  if (Subtarget->requiresSimulatedTrap())
+    return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
+
   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
   SDValue Ops[] = {
     Chain,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f4b21b7dfac391..0d5928cc25e3b7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2026,6 +2026,53 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
   }
 }
 
+MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
+                                                    MachineBasicBlock &MBB,
+                                                    MachineInstr &MI,
+                                                    const DebugLoc &DL) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
+  MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock();
+  MF->push_back(HaltLoop);
+
+  // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
+  // will be a nop.
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
+      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
+  Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
+      .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
+      .addUse(AMDGPU::M0);
+  Register And0x3ff = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), And0x3ff)
+      .addUse(DoorbellReg)
+      .addImm(0x3ff);
+  Register SetWaveAbortBit =
+      MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
+      .addUse(And0x3ff)
+      .addImm(0x400);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+      .addUse(SetWaveAbortBit);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
+      .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+      .addUse(AMDGPU::TTMP2);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);
+
+  BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
+  BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
+      .addMBB(HaltLoop);
+
+  if (SplitBB != &MBB)
+    MBB.removeSuccessor(SplitBB);
+  MBB.addSuccessor(HaltLoop);
+  HaltLoop->addSuccessor(HaltLoop);
+
+  return SplitBB;
+}
+
 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 4c5978cdc6665c..b314b9b2fb5135 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1194,6 +1194,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                    unsigned Quantity) const override;
 
   void insertReturn(MachineBasicBlock &MBB) const;
+
+  /// Build instructions that simulate the behavior of a `s_trap 2` instructions
+  /// for hardware (namely, gfx11) that runs in PRIV=1 mode. There, s_trap is
+  /// interpreted as a nop.
+  MachineBasicBlock *insertSimulatedTrap(MachineRegisterInfo &MRI,
+                                         MachineBasicBlock &MBB,
+                                         MachineInstr &MI,
+                                         const DebugLoc &DL) const;
+
   /// Return the number of wait states that result from executing this
   /// instruction.
   static unsigned getNumWaitStates(const MachineInstr &MI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1c942dcefdacea..3f9a048a329b2a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -106,6 +106,12 @@ def ENDPGM_TRAP : SPseudoInstSI<
   let usesCustomInserter = 1;
 }
 
+def SIMULATED_TRAP : SPseudoInstSI<(outs), (ins), [(AMDGPUsimulated_trap)],
+                                   "SIMULATED_TRAP"> {
+  let hasSideEffects = 1;
+  let usesCustomInserter = 1;
+}
+
 def ATOMIC_FENCE : SPseudoInstSI<
   (outs), (ins i32imm:$ordering, i32imm:$scope),
   [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
new file mode 100644
index 00000000000000..0795e2b68bec98
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GCN %s
+
+---
+name: test_trap
+body: |
+  bb.0:
+    ; GCN-LABEL: name: test_trap
+    ; GCN: successors: %bb.2(0x80000000)
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+    ; GCN-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+    ; GCN-NEXT: S_TRAP 2
+    ; GCN-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
+    ; GCN-NEXT: $ttmp2 = S_MOV_B32 $m0
+    ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
+    ; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
+    ; GCN-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
+    ; GCN-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
+    ; GCN-NEXT: $m0 = S_MOV_B32 $ttmp2
+    ; GCN-NEXT: S_BRANCH %bb.2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: .1:
+    ; GCN-NEXT: successors:
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: .2:
+    ; GCN-NEXT: successors: %bb.2(0x80000000)
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: S_SETHALT 5
+    ; GCN-NEXT: S_BRANCH %bb.2
+    %0:_(s8) = G_CONSTANT i8 0
+    %1:_(p1) = G_CONSTANT i64 0
+    G_STORE %0, %1 :: (store 1, addrspace 1)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap)
+    G_STORE %0, %1 :: (store 1, addrspace 1)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 3cd6c98ef4b8e0..8d5d752925742b 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -3,6 +3,7 @@
 ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803 %s
 ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s
 ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s
+; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100 %s
 
 declare void @llvm.trap() #0
 declare void @llvm.debugtrap() #1
@@ -49,6 +50,27 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-NOTRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
 ; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-NOTRAP-GFX900-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX1100-LABEL: trap:
+; HSA-TRAP-GFX1100:       ; %bb.0:
+; HSA-TRAP-GFX1100-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; HSA-TRAP-GFX1100-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
+; HSA-TRAP-GFX1100-NEXT:    s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT:    s_trap 2
+; HSA-TRAP-GFX1100-NEXT:    s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT:    s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; HSA-TRAP-GFX1100-NEXT:    s_bitset1_b32 s0, 10
+; HSA-TRAP-GFX1100-NEXT:    s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-NEXT:    s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-NEXT:    s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-NEXT:    s_sethalt 5
+; HSA-TRAP-GFX1100-NEXT:    s_branch .LBB0_1
   store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.trap()
   unreachable
@@ -128,6 +150,37 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-NOTRAP-GFX900-NEXT:    s_endpgm
 ; HSA-NOTRAP-GFX900-NEXT:  .LBB1_2: ; %trap
 ; HSA-NOTRAP-GFX900-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX1100-LABEL: non_entry_trap:
+; HSA-TRAP-GFX1100:       ; %bb.0: ; %entry
+; HSA-TRAP-GFX1100-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; HSA-TRAP-GFX1100-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, -1, v1
+; HSA-TRAP-GFX1100-NEXT:    s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX1100-NEXT:  ; %bb.1: ; %ret
+; HSA-TRAP-GFX1100-NEXT:    v_mov_b32_e32 v1, 3
+; HSA-TRAP-GFX1100-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT:    s_nop 0
+; HSA-TRAP-GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; HSA-TRAP-GFX1100-NEXT:    s_endpgm
+; HSA-TRAP-GFX1100-NEXT:  .LBB1_2: ; %trap
+; HSA-TRAP-GFX1100-NEXT:    s_trap 2
+; HSA-TRAP-GFX1100-NEXT:    s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-NEXT:    s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT:    s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; HSA-TRAP-GFX1100-NEXT:    s_bitset1_b32 s0, 10
+; HSA-TRAP-GFX1100-NEXT:    s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-NEXT:    s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-NEXT:    s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-NEXT:    s_sethalt 5
+; HSA-TRAP-GFX1100-NEXT:    s_branch .LBB1_3
 entry:
   %tmp29 = load volatile i32, ptr addrspace(1) %arg0
   %cmp = icmp eq i32 %tmp29, -1
@@ -197,6 +250,21 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-NOTRAP-GFX900-NEXT:    global_store_dword v0, v2, s[0:1]
 ; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-NOTRAP-GFX900-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX1100-LABEL: debugtrap:
+; HSA-TRAP-GFX1100:       ; %bb.0:
+; HSA-TRAP-GFX1100-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; HSA-TRAP-GFX1100-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
+; HSA-TRAP-GFX1100-NEXT:    v_mov_b32_e32 v2, 2
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT:    s_trap 3
+; HSA-TRAP-GFX1100-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
+; HSA-TRAP-GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT:    s_nop 0
+; HSA-TRAP-GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; HSA-TRAP-GFX1100-NEXT:    s_endpgm
   store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.debugtrap()
   store volatile i32 2, ptr addrspace(1) %arg0

>From 4f2c25e6e1345d890d04168a08136043c1d1047b Mon Sep 17 00:00:00 2001
From: Emma Pilkington <emma.pilkington95 at gmail.com>
Date: Thu, 21 Mar 2024 13:23:07 -0400
Subject: [PATCH 2/4] Add an -O0 test

---
 llvm/test/CodeGen/AMDGPU/trap-abis.ll | 69 +++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 8d5d752925742b..dcc5fbd142c427 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -4,6 +4,7 @@
 ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s
 ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s
 ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100 %s
+; RUN: llc %s -o - -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100-O0 %s
 
 declare void @llvm.trap() #0
 declare void @llvm.debugtrap() #1
@@ -71,6 +72,27 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-TRAP-GFX1100-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; HSA-TRAP-GFX1100-NEXT:    s_sethalt 5
 ; HSA-TRAP-GFX1100-NEXT:    s_branch .LBB0_1
+;
+; HSA-TRAP-GFX1100-O0-LABEL: trap:
+; HSA-TRAP-GFX1100-O0:       ; %bb.0:
+; HSA-TRAP-GFX1100-O0-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX1100-O0-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_trap 2
+; HSA-TRAP-GFX1100-O0-NEXT:    s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT:    s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-O0-NEXT:    s_or_b32 s0, s0, 0x400
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-O0-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-O0-NEXT:    s_sethalt 5
+; HSA-TRAP-GFX1100-O0-NEXT:    s_branch .LBB0_1
   store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.trap()
   unreachable
@@ -181,6 +203,53 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-TRAP-GFX1100-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
 ; HSA-TRAP-GFX1100-NEXT:    s_sethalt 5
 ; HSA-TRAP-GFX1100-NEXT:    s_branch .LBB1_3
+;
+; HSA-TRAP-GFX1100-O0-LABEL: non_entry_trap:
+; HSA-TRAP-GFX1100-O0:       ; %bb.0: ; %entry
+; HSA-TRAP-GFX1100-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
+; HSA-TRAP-GFX1100-O0-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; HSA-TRAP-GFX1100-O0-NEXT:    v_writelane_b32 v0, s2, 0
+; HSA-TRAP-GFX1100-O0-NEXT:    v_writelane_b32 v0, s3, 1
+; HSA-TRAP-GFX1100-O0-NEXT:    s_or_saveexec_b32 s6, -1
+; HSA-TRAP-GFX1100-O0-NEXT:    scratch_store_b32 off, v0, off ; 4-byte Folded Spill
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 exec_lo, s6
+; HSA-TRAP-GFX1100-O0-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
+; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 s0, -1
+; HSA-TRAP-GFX1100-O0-NEXT:    ; implicit-def: $sgpr1
+; HSA-TRAP-GFX1100-O0-NEXT:    v_cmp_eq_u32_e64 s0, v0, s0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_cbranch_vccnz .LBB1_2
+; HSA-TRAP-GFX1100-O0-NEXT:  ; %bb.1: ; %trap
+; HSA-TRAP-GFX1100-O0-NEXT:    s_trap 2
+; HSA-TRAP-GFX1100-O0-NEXT:    s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT:    s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-O0-NEXT:    s_or_b32 s0, s0, 0x400
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-O0-NEXT:    s_branch .LBB1_3
+; HSA-TRAP-GFX1100-O0-NEXT:  .LBB1_2: ; %ret
+; HSA-TRAP-GFX1100-O0-NEXT:    s_or_saveexec_b32 s6, -1
+; HSA-TRAP-GFX1100-O0-NEXT:    scratch_load_b32 v0, off, off ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 exec_lo, s6
+; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT:    v_readlane_b32 s0, v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; HSA-TRAP-GFX1100-O0-NEXT:    v_mov_b32_e32 v1, 0
+; HSA-TRAP-GFX1100-O0-NEXT:    v_mov_b32_e32 v2, 3
+; HSA-TRAP-GFX1100-O0-NEXT:    global_store_b32 v1, v2, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT:    ; kill: killed $vgpr0
+; HSA-TRAP-GFX1100-O0-NEXT:    s_endpgm
+; HSA-TRAP-GFX1100-O0-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-O0-NEXT:    s_sethalt 5
+; HSA-TRAP-GFX1100-O0-NEXT:    s_branch .LBB1_3
 entry:
   %tmp29 = load volatile i32, ptr addrspace(1) %arg0
   %cmp = icmp eq i32 %tmp29, -1

>From b575811e688b5b09630da5d49cb359b8f8863da6 Mon Sep 17 00:00:00 2001
From: Emma Pilkington <emma.pilkington95 at gmail.com>
Date: Fri, 22 Mar 2024 10:26:55 -0400
Subject: [PATCH 3/4] Address review comments

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 +++--
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  8 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 14 +++--
 .../AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 62 +++++++++++--------
 4 files changed, 59 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bb79c4a376e4e2..8caac830673ff3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6724,16 +6724,18 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
   return true;
 }
 
-bool AMDGPULegalizerInfo::legalizeTrapHsa(
-    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
-  if (!ST.requiresSimulatedTrap()) {
-    B.buildInstr(AMDGPU::S_TRAP)
-        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
+bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
+                                          MachineRegisterInfo &MRI,
+                                          MachineIRBuilder &B) const {
+  if (ST.requiresSimulatedTrap()) {
+    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
+                                           MI.getDebugLoc());
     MI.eraseFromParent();
     return true;
   }
 
-  ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, MI.getDebugLoc());
+  B.buildInstr(AMDGPU::S_TRAP)
+      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 67868f273b91df..266255c16f192c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -23,6 +23,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/TargetParser/TargetParser.h"
 
 #define GET_SUBTARGETINFO_HEADER
 #include "AMDGPUGenSubtargetInfo.inc"
@@ -443,7 +444,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
   }
 
-  bool requiresSimulatedTrap() const { return getGeneration() == GFX11; }
+  // True on hardware where 's_trap 2' is treated as a nop that must be
+  // simulated.
+  bool requiresSimulatedTrap() const {
+    AMDGPU::IsaVersion V = AMDGPU::getIsaVersion(getCPU());
+    return V.Major == 11 && V.Minor <= 3;
+  }
 
   bool supportsGetDoorbellID() const {
     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0d5928cc25e3b7..8f88a991eae967 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2035,6 +2035,9 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
   MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock();
   MF->push_back(HaltLoop);
 
+  constexpr unsigned DoorbellIDMask = 0x3ff;
+  constexpr unsigned ECQueueWaveAbort = 0x400;
+
   // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
   // will be a nop.
   BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
@@ -2044,15 +2047,16 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
       .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
   BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
       .addUse(AMDGPU::M0);
-  Register And0x3ff = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-  BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), And0x3ff)
+  Register DoorbellRegMasked =
+      MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
       .addUse(DoorbellReg)
-      .addImm(0x3ff);
+      .addImm(DoorbellIDMask);
   Register SetWaveAbortBit =
       MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
-      .addUse(And0x3ff)
-      .addImm(0x400);
+      .addUse(DoorbellRegMasked)
+      .addImm(ECQueueWaveAbort);
   BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
       .addUse(SetWaveAbortBit);
   BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
index 0795e2b68bec98..de3fedaf14f1fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
@@ -1,36 +1,44 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s
 
 ---
 name: test_trap
 body: |
   bb.0:
-    ; GCN-LABEL: name: test_trap
-    ; GCN: successors: %bb.2(0x80000000)
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-    ; GCN-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
-    ; GCN-NEXT: S_TRAP 2
-    ; GCN-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
-    ; GCN-NEXT: $ttmp2 = S_MOV_B32 $m0
-    ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
-    ; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
-    ; GCN-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
-    ; GCN-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
-    ; GCN-NEXT: $m0 = S_MOV_B32 $ttmp2
-    ; GCN-NEXT: S_BRANCH %bb.2
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: .1:
-    ; GCN-NEXT: successors:
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: .2:
-    ; GCN-NEXT: successors: %bb.2(0x80000000)
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: S_SETHALT 5
-    ; GCN-NEXT: S_BRANCH %bb.2
+    ; GFX1100-LABEL: name: test_trap
+    ; GFX1100: successors: %bb.2(0x80000000)
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+    ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+    ; GFX1100-NEXT: S_TRAP 2
+    ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
+    ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
+    ; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
+    ; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
+    ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
+    ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
+    ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
+    ; GFX1100-NEXT: S_BRANCH %bb.2
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: .1:
+    ; GFX1100-NEXT: successors:
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: .2:
+    ; GFX1100-NEXT: successors: %bb.2(0x80000000)
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: S_SETHALT 5
+    ; GFX1100-NEXT: S_BRANCH %bb.2
+    ;
+    ; GFX1150-LABEL: name: test_trap
+    ; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+    ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+    ; GFX1150-NEXT: S_TRAP 2
+    ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
     %0:_(s8) = G_CONSTANT i8 0
     %1:_(p1) = G_CONSTANT i64 0
     G_STORE %0, %1 :: (store 1, addrspace 1)

>From 258a604d7614d0a5c09385bb5e7ab7fa064c37ad Mon Sep 17 00:00:00 2001
From: Emma Pilkington <emma.pilkington95 at gmail.com>
Date: Mon, 25 Mar 2024 10:13:08 -0400
Subject: [PATCH 4/4] Use a feature, switch to G_TRAP

---
 llvm/lib/Target/AMDGPU/AMDGPU.td                      |  9 ++++++++-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp        |  4 +++-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h                 | 11 +++--------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp             |  6 ++++--
 .../CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir |  2 +-
 5 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 37dcfef3b2a3da..2d9916b6ec01f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -303,6 +303,12 @@ def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
   "MSAA loads not honoring dst_sel bug"
 >;
 
+def FeaturePrivEnabledBug : SubtargetFeature<"priv-enabled-bug",
+  "HasPrivEnabledBug",
+  "true",
+  "Hardware that runs with PRIV=1 workaround"
+>;
+
 class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
   "ldsbankcount"#Value,
   "LDSBankCount",
@@ -1489,7 +1495,8 @@ def FeatureISAVersion11_0_Common : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureMSAALoadDstSelBug,
      FeatureVALUTransUseHazard,
-     FeatureMADIntraFwdBug])>;
+     FeatureMADIntraFwdBug,
+     FeaturePrivEnabledBug])>;
 
 def FeatureISAVersion11_0_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_0_Common.Features,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8caac830673ff3..deba02e19667d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6727,7 +6727,9 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
-  if (ST.requiresSimulatedTrap()) {
+  // We need to simulate the 's_trap 2' instruction on targets that run in
+  // PRIV=1 (where it is treated as a nop).
+  if (ST.hasPrivEnabledBug()) {
     ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
                                            MI.getDebugLoc());
     MI.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 266255c16f192c..f230023a05f27c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -23,7 +23,6 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/TargetParser/TargetParser.h"
 
 #define GET_SUBTARGETINFO_HEADER
 #include "AMDGPUGenSubtargetInfo.inc"
@@ -224,6 +223,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasImageStoreD16Bug = false;
   bool HasImageGather4D16Bug = false;
   bool HasMSAALoadDstSelBug = false;
+  bool HasPrivEnabledBug = false;
   bool Has1_5xVGPRs = false;
   bool HasMADIntraFwdBug = false;
   bool HasVOPDInsts = false;
@@ -444,13 +444,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
   }
 
-  // True on hardware where 's_trap 2' is treated as a nop that must be
-  // simulated.
-  bool requiresSimulatedTrap() const {
-    AMDGPU::IsaVersion V = AMDGPU::getIsaVersion(getCPU());
-    return V.Major == 11 && V.Minor <= 3;
-  }
-
   bool supportsGetDoorbellID() const {
     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
     return getGeneration() >= GFX9;
@@ -1034,6 +1027,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
 
+  bool hasPrivEnabledBug() const { return HasPrivEnabledBug; }
+
   bool hasNSAEncoding() const { return HasNSAEncoding; }
 
   bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 174182a5eabba2..003be15fb73df2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5410,7 +5410,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     return SplitBB;
   }
   case AMDGPU::SIMULATED_TRAP: {
-    assert(Subtarget->requiresSimulatedTrap());
+    assert(Subtarget->hasPrivEnabledBug());
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
     MachineBasicBlock *SplitBB =
         TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
@@ -6635,7 +6635,9 @@ SDValue SITargetLowering::lowerTrapHsa(
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
 
-  if (Subtarget->requiresSimulatedTrap())
+  // We need to simulate the 's_trap 2' instruction on targets that run in
+  // PRIV=1 (where it is treated as a nop).
+  if (Subtarget->hasPrivEnabledBug())
     return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
 
   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
index de3fedaf14f1fc..703a94bb48470c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
@@ -42,7 +42,7 @@ body: |
     %0:_(s8) = G_CONSTANT i8 0
     %1:_(p1) = G_CONSTANT i64 0
     G_STORE %0, %1 :: (store 1, addrspace 1)
-    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap)
+    G_TRAP
     G_STORE %0, %1 :: (store 1, addrspace 1)
 
 ...



More information about the llvm-commits mailing list