[llvm] [AMDGPU] Move architected SGPR implementation into isel (PR #79120)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 23 03:09:04 PST 2024


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/79120

- Precommit tests
- Implement architected SGPR support directly in legalization/isel.
- Remove architected SGPR support from argument handling


From 99bbf62b9abd46e8be842b56499894f241c8d9c6 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 22 Jan 2024 16:05:29 +0000
Subject: [PATCH 1/3] Precommit tests

---
 .../lower-work-group-id-intrinsics-hsa.ll     | 145 ++++++++++++++++++
 ... => lower-work-group-id-intrinsics-pal.ll} |  19 +++
 2 files changed, 164 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
 rename llvm/test/CodeGen/AMDGPU/{lower-work-group-id-intrinsics.ll => lower-work-group-id-intrinsics-pal.ll} (81%)

diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
new file mode 100644
index 000000000000000..afcaaa7faff7bde
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s
+
+define amdgpu_kernel void @workgroup_ids_kernel() {
+; GFX9-SDAG-LABEL: workgroup_ids_kernel:
+; GFX9-SDAG:       ; %bb.0: ; %.entry
+; GFX9-SDAG-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: workgroup_ids_kernel:
+; GFX9-GISEL:       ; %bb.0: ; %.entry
+; GFX9-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX9-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-GISEL-NEXT:    s_endpgm
+.entry:
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+  %idz = call i32 @llvm.amdgcn.workgroup.id.z()
+  %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0
+  %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1
+  %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @caller() {
+; GFX9-SDAG-LABEL: caller:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-SDAG-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-SDAG-NEXT:    s_mov_b32 s38, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-SDAG-NEXT:    s_add_u32 s36, s36, s8
+; GFX9-SDAG-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-SDAG-NEXT:    s_add_u32 s8, s4, 36
+; GFX9-SDAG-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-SDAG-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-SDAG-NEXT:    s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX9-SDAG-NEXT:    s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[14:15], s[4:5], 0x0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-SDAG-NEXT:    s_mov_b32 s12, ttmp9
+; GFX9-SDAG-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-SDAG-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_swappc_b64 s[30:31], s[14:15]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: caller:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-GISEL-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-GISEL-NEXT:    s_mov_b32 s38, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-GISEL-NEXT:    s_add_u32 s36, s36, s8
+; GFX9-GISEL-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s8, s4, 36
+; GFX9-GISEL-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX9-GISEL-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-GISEL-NEXT:    s_mov_b32 s12, ttmp9
+; GFX9-GISEL-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], s[14:15]
+; GFX9-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-GISEL-NEXT:    s_endpgm
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  call void @callee(i32 %idx) #0
+  ret void
+}
+
+declare void @callee(i32) #0
+
+define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
+; GFX9-SDAG-LABEL: workgroup_ids_device_func:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, s12
+; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v6, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-SDAG-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s14
+; GFX9-SDAG-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: workgroup_ids_device_func:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, s12
+; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v6, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+  %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+  %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %id.x, ptr addrspace(1) %outx
+  store volatile i32 %id.y, ptr addrspace(1) %outy
+  store volatile i32 %id.z, ptr addrspace(1) %outz
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg)
+
+attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
similarity index 81%
rename from llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
rename to llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index c732ff709425505..b058cec6f6d4630 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -75,6 +75,25 @@ define amdgpu_cs void @caller() {
 
 declare amdgpu_gfx void @callee(i32)
 
+define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
+; GFX9-SDAG-LABEL: workgroup_ids_gfx:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: workgroup_ids_gfx:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+  %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+  %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %id.x, ptr addrspace(1) %outx
+  store volatile i32 %id.y, ptr addrspace(1) %outy
+  store volatile i32 %id.z, ptr addrspace(1) %outz
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workgroup.id.x()
 declare i32 @llvm.amdgcn.workgroup.id.y()
 declare i32 @llvm.amdgcn.workgroup.id.z()

From 675f3fc0b282f2ad1877e91b3676ea6ee1e9d77a Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 23 Jan 2024 10:18:03 +0000
Subject: [PATCH 2/3] Implement architected SGPR support directly in
 legalization/isel.

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 36 ++++++++++++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 36 ++++++++++++-
 .../lower-work-group-id-intrinsics-hsa.ll     | 20 ++++---
 .../lower-work-group-id-intrinsics-pal.ll     |  4 +-
 .../AMDGPU/workgroup-id-in-arch-sgprs.ll      | 54 ++++++++++---------
 5 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8e74d4c0e94592b..b88d7534f3e26ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4178,10 +4178,42 @@ bool AMDGPULegalizerInfo::loadInputValue(
     Register DstReg, MachineIRBuilder &B,
     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
-  const ArgDescriptor *Arg;
+  const ArgDescriptor *Arg = nullptr;
   const TargetRegisterClass *ArgRC;
   LLT ArgTy;
-  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
+
+  const ArgDescriptor WorkGroupIDX =
+      ArgDescriptor::createRegister(AMDGPU::TTMP9);
+  // TODO: No need to mask GridY if GridZ is not valid.
+  const ArgDescriptor WorkGroupIDY =
+      ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu);
+  const ArgDescriptor WorkGroupIDZ =
+      ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+  if (ST.hasArchitectedSGPRs() &&
+      AMDGPU::isCompute(B.getMF().getFunction().getCallingConv())) {
+    switch (ArgType) {
+    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
+      Arg = &WorkGroupIDX;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
+      Arg = &WorkGroupIDY;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
+      Arg = &WorkGroupIDZ;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Arg)
+    std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
 
   if (!Arg) {
     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 073c8cc72117375..2cc0fc1f54ddc7e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2063,11 +2063,43 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
   const SIMachineFunctionInfo &MFI,
   EVT VT,
   AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
-  const ArgDescriptor *Reg;
+  const ArgDescriptor *Reg = nullptr;
   const TargetRegisterClass *RC;
   LLT Ty;
 
-  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
+  const ArgDescriptor WorkGroupIDX =
+      ArgDescriptor::createRegister(AMDGPU::TTMP9);
+  // TODO: No need to mask GridY if GridZ is not valid.
+  const ArgDescriptor WorkGroupIDY =
+      ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu);
+  const ArgDescriptor WorkGroupIDZ =
+      ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+  if (Subtarget->hasArchitectedSGPRs() &&
+      AMDGPU::isCompute(
+          DAG.getMachineFunction().getFunction().getCallingConv())) {
+    switch (PVID) {
+    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
+      Reg = &WorkGroupIDX;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
+      Reg = &WorkGroupIDY;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
+      Reg = &WorkGroupIDZ;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Reg)
+    std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
   if (!Reg) {
     if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
       // It's possible for a kernarg intrinsic call to appear in a kernel with
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index afcaaa7faff7bde..82f9d854f987f09 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -5,11 +5,11 @@
 define amdgpu_kernel void @workgroup_ids_kernel() {
 ; GFX9-SDAG-LABEL: workgroup_ids_kernel:
 ; GFX9-SDAG:       ; %bb.0: ; %.entry
-; GFX9-SDAG-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
 ; GFX9-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -104,13 +104,15 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
 ; GFX9-SDAG-LABEL: workgroup_ids_device_func:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, s12
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, ttmp9
+; GFX9-SDAG-NEXT:    s_and_b32 s4, ttmp7, 0xffff
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    s_lshr_b32 s4, ttmp7, 16
 ; GFX9-SDAG-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s14
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-SDAG-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -118,13 +120,15 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
 ; GFX9-GISEL-LABEL: workgroup_ids_device_func:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, s12
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, ttmp9
+; GFX9-GISEL-NEXT:    s_and_b32 s4, ttmp7, 0xffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s5, ttmp7, 16
 ; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-GISEL-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index b058cec6f6d4630..e3ec6ed6bcfb895 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -5,11 +5,11 @@
 define amdgpu_cs void @_amdgpu_cs_main() {
 ; GFX9-SDAG-LABEL: _amdgpu_cs_main:
 ; GFX9-SDAG:       ; %bb.0: ; %.entry
-; GFX9-SDAG-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
 ; GFX9-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index c492b54759d82d7..8c23c73e3163156 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -6,10 +6,10 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
 ; GCN-SDAG-LABEL: workgroup_id_x:
 ; GCN-SDAG:       ; %bb.0:
 ; GCN-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, ttmp9
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GCN-SDAG-NEXT:    s_endpgm
 ;
 ; GCN-GISEL-LABEL: workgroup_id_x:
@@ -30,23 +30,25 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace
 ; GCN-SDAG-LABEL: workgroup_id_xy:
 ; GCN-SDAG:       ; %bb.0:
 ; GCN-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, ttmp9
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-SDAG-NEXT:    s_and_b32 s4, ttmp7, 0xffff
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, ttmp7
-; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GCN-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-SDAG-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GCN-SDAG-NEXT:    s_endpgm
 ;
 ; GCN-GISEL-LABEL: workgroup_id_xy:
 ; GCN-GISEL:       ; %bb.0:
 ; GCN-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, ttmp9
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-GISEL-NEXT:    s_and_b32 s4, ttmp7, 0xffff
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, ttmp7
-; GCN-GISEL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-GISEL-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GCN-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %idx, ptr addrspace(1) %ptrx
@@ -60,33 +62,33 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
 ; GCN-SDAG-LABEL: workgroup_id_xyz:
 ; GCN-SDAG:       ; %bb.0:
 ; GCN-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, ttmp9
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GCN-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-SDAG-NEXT:    s_and_b32 s6, ttmp7, 0xffff
 ; GCN-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
-; GCN-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
-; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GCN-SDAG-NEXT:    global_store_dword v1, v0, s[2:3]
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-SDAG-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GCN-SDAG-NEXT:    s_endpgm
 ;
 ; GCN-GISEL-LABEL: workgroup_id_xyz:
 ; GCN-GISEL:       ; %bb.0:
 ; GCN-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GCN-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-GISEL-NEXT:    s_and_b32 s6, ttmp7, 0xffff
 ; GCN-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GCN-GISEL-NEXT:    s_and_b32 s0, ttmp7, 0xffff
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-GISEL-NEXT:    s_lshr_b32 s0, ttmp7, 16
 ; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GCN-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %idx, ptr addrspace(1) %ptrx

From dff51a9fd7516fe56d73afc8ba01e52e037610dc Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 23 Jan 2024 10:57:43 +0000
Subject: [PATCH 3/3] Remove architected SGPR support from argument handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 30 ++++++++---------
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 32 ++++++-------------
 .../lower-work-group-id-intrinsics-hsa.ll     | 14 ++++----
 3 files changed, 28 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2cc0fc1f54ddc7e..2d7fd51b135bed2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2528,28 +2528,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
     }
   }
 
-  if (Info.hasWorkGroupIDX()) {
-    Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
-    if (!HasArchitectedSGPRs)
+  if (!HasArchitectedSGPRs) {
+    if (Info.hasWorkGroupIDX()) {
+      Register Reg = Info.addWorkGroupIDX();
       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+      CCInfo.AllocateReg(Reg);
+    }
 
-    CCInfo.AllocateReg(Reg);
-  }
-
-  if (Info.hasWorkGroupIDY()) {
-    Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
-    if (!HasArchitectedSGPRs)
+    if (Info.hasWorkGroupIDY()) {
+      Register Reg = Info.addWorkGroupIDY();
       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+      CCInfo.AllocateReg(Reg);
+    }
 
-    CCInfo.AllocateReg(Reg);
-  }
-
-  if (Info.hasWorkGroupIDZ()) {
-    Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
-    if (!HasArchitectedSGPRs)
+    if (Info.hasWorkGroupIDZ()) {
+      Register Reg = Info.addWorkGroupIDZ();
       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
-
-    CCInfo.AllocateReg(Reg);
+      CCInfo.AllocateReg(Reg);
+    }
   }
 
   if (Info.hasWorkGroupInfo()) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index dc63ae44c528dbe..15a0a3586b3aed6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -744,35 +744,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   }
 
   // Add system SGPRs.
-  Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
-    Register Reg =
-        HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR();
-    ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
-    if (!HasArchitectedSGPRs)
-      NumSystemSGPRs += 1;
-
+  Register addWorkGroupIDX() {
+    ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
+    NumSystemSGPRs += 1;
     return ArgInfo.WorkGroupIDX.getRegister();
   }
 
-  Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
-    Register Reg =
-        HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
-    unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
-    ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
-    if (!HasArchitectedSGPRs)
-      NumSystemSGPRs += 1;
-
+  Register addWorkGroupIDY() {
+    ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
+    NumSystemSGPRs += 1;
     return ArgInfo.WorkGroupIDY.getRegister();
   }
 
-  Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
-    Register Reg =
-        HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
-    unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
-    ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
-    if (!HasArchitectedSGPRs)
-      NumSystemSGPRs += 1;
-
+  Register addWorkGroupIDZ() {
+    ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
+    NumSystemSGPRs += 1;
     return ArgInfo.WorkGroupIDZ.getRegister();
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index 82f9d854f987f09..f7fe928489cf44d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -48,20 +48,19 @@ define amdgpu_kernel void @caller() {
 ; GFX9-SDAG-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-SDAG-NEXT:    s_add_u32 s4, s4, callee@gotpcrel32@lo+4
 ; GFX9-SDAG-NEXT:    s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
-; GFX9-SDAG-NEXT:    s_load_dwordx2 s[14:15], s[4:5], 0x0
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-SDAG-NEXT:    s_mov_b32 s12, ttmp9
 ; GFX9-SDAG-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, 0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_swappc_b64 s[30:31], s[14:15]
+; GFX9-SDAG-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: caller:
@@ -74,24 +73,23 @@ define amdgpu_kernel void @caller() {
 ; GFX9-GISEL-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-GISEL-NEXT:    s_add_u32 s8, s4, 36
 ; GFX9-GISEL-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX9-GISEL-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX9-GISEL-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
 ; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[14:15], s[0:1], 0x0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-GISEL-NEXT:    s_mov_b32 s12, ttmp9
 ; GFX9-GISEL-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], s[14:15]
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], s[12:13]
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-GISEL-NEXT:    s_swappc_b64 s[30:31], s[14:15]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
   call void @callee(i32 %idx) #0



More information about the llvm-commits mailing list