[llvm] [AMDGPU] Rework architected SGPRs implementation (PR #79001)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 22 08:06:43 PST 2024


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/79001

Rework the architected SGPRs implementation such that workgroup id
values (which live in TTMP registers) are available in all functions and
do not rely on calling allocateSystemSGPRs to set them up.


From 9ba45ba9753b305641eef023976dfc1a10f7a667 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 22 Jan 2024 16:05:29 +0000
Subject: [PATCH] [AMDGPU] Rework architected SGPRs implementation

Rework the architected SGPRs implementation such that workgroup id
values (which live in TTMP registers) are available in all functions and
do not rely on calling allocateSystemSGPRs to set them up.
---
 .../Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp |  12 +-
 .../Target/AMDGPU/AMDGPUArgumentUsageInfo.h   |   5 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  30 ++--
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  13 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  30 ++--
 .../AMDGPU/amdgcn-load-offset-from-reg.ll     |   4 +-
 .../AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll |   4 +-
 .../lower-work-group-id-intrinsics-hsa.ll     | 149 ++++++++++++++++++
 ... => lower-work-group-id-intrinsics-pal.ll} |  41 +++++
 .../AMDGPU/pal-simple-indirect-call.ll        |   5 +-
 llvm/test/CodeGen/AMDGPU/scratch-simple.ll    |  34 ++--
 11 files changed, 263 insertions(+), 64 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
 rename llvm/test/CodeGen/AMDGPU/{lower-work-group-id-intrinsics.ll => lower-work-group-id-intrinsics-pal.ll} (65%)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index de25f9241a5036..c6c7ee489b5ac6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -100,13 +100,19 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
                       &AMDGPU::SGPR_64RegClass,
                       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
   case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
-    return std::tuple(WorkGroupIDX ? &WorkGroupIDX : nullptr,
+    return std::tuple(ArchitectedWorkGroupIDX ? &ArchitectedWorkGroupIDX
+                      : WorkGroupIDX          ? &WorkGroupIDX
+                                              : nullptr,
                       &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
   case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
-    return std::tuple(WorkGroupIDY ? &WorkGroupIDY : nullptr,
+    return std::tuple(ArchitectedWorkGroupIDY ? &ArchitectedWorkGroupIDY
+                      : WorkGroupIDY          ? &WorkGroupIDY
+                                              : nullptr,
                       &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
   case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
-    return std::tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
+    return std::tuple(ArchitectedWorkGroupIDZ ? &ArchitectedWorkGroupIDZ
+                      : WorkGroupIDZ          ? &WorkGroupIDZ
+                                              : nullptr,
                       &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
   case AMDGPUFunctionArgInfo::LDS_KERNEL_ID:
     return std::tuple(LDSKernelId ? &LDSKernelId : nullptr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 42b33c50d9f8c4..91b585cf7261d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -143,6 +143,11 @@ struct AMDGPUFunctionArgInfo {
   ArgDescriptor WorkGroupInfo;
   ArgDescriptor PrivateSegmentWaveByteOffset;
 
+  // System TTMPs.
+  ArgDescriptor ArchitectedWorkGroupIDX;
+  ArgDescriptor ArchitectedWorkGroupIDY;
+  ArgDescriptor ArchitectedWorkGroupIDZ;
+
   // Pointer with offset from kernargsegmentptr to where special ABI arguments
   // are passed to callable functions.
   ArgDescriptor ImplicitArgPtr;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 073c8cc7211737..651e5a0a6335ff 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2496,28 +2496,27 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
     }
   }
 
-  if (Info.hasWorkGroupIDX()) {
-    Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
-    if (!HasArchitectedSGPRs)
+  if (!HasArchitectedSGPRs) {
+    if (Info.hasWorkGroupIDX()) {
+      Register Reg = Info.addWorkGroupIDX();
       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
 
-    CCInfo.AllocateReg(Reg);
-  }
+      CCInfo.AllocateReg(Reg);
+    }
 
-  if (Info.hasWorkGroupIDY()) {
-    Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
-    if (!HasArchitectedSGPRs)
+    if (Info.hasWorkGroupIDY()) {
+      Register Reg = Info.addWorkGroupIDY();
       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
 
-    CCInfo.AllocateReg(Reg);
-  }
+      CCInfo.AllocateReg(Reg);
+    }
 
-  if (Info.hasWorkGroupIDZ()) {
-    Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
-    if (!HasArchitectedSGPRs)
+    if (Info.hasWorkGroupIDZ()) {
+      Register Reg = Info.addWorkGroupIDZ();
       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
 
-    CCInfo.AllocateReg(Reg);
+      CCInfo.AllocateReg(Reg);
+    }
   }
 
   if (Info.hasWorkGroupInfo()) {
@@ -2722,9 +2721,6 @@ SDValue SITargetLowering::LowerFormalArguments(
     (void)UserSGPRInfo;
     if (!Subtarget->enableFlatScratch())
       assert(!UserSGPRInfo.hasFlatScratchInit());
-    if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
-      assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
-             !Info->hasWorkGroupIDZ());
   }
 
   if (CallConv == CallingConv::AMDGPU_PS) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index e8142244b7db69..4efd8560cf1354 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -107,8 +107,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
       MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
   }
 
-  if (!AMDGPU::isGraphics(CC) ||
-      (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) {
+  if (!AMDGPU::isGraphics(CC) || CC == CallingConv::AMDGPU_CS ||
+      ST.hasArchitectedSGPRs()) {
     if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
       WorkGroupIDX = true;
 
@@ -169,6 +169,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
     VGPRForAGPRCopy =
         AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
   }
+
+  if (STI->hasArchitectedSGPRs()) {
+    ArgInfo.ArchitectedWorkGroupIDX =
+        ArgDescriptor::createRegister(AMDGPU::TTMP9);
+    ArgInfo.ArchitectedWorkGroupIDY = ArgDescriptor::createRegister(
+        AMDGPU::TTMP7, WorkGroupIDZ ? 0xFFFFu : ~0u);
+    ArgInfo.ArchitectedWorkGroupIDZ =
+        ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+  }
 }
 
 MachineFunctionInfo *SIMachineFunctionInfo::clone(
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index dc63ae44c528db..ecc29da85ccb68 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -744,34 +744,26 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   }
 
   // Add system SGPRs.
-  Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
-    Register Reg =
-        HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR();
+  Register addWorkGroupIDX() {
+    Register Reg = getNextSystemSGPR();
     ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
-    if (!HasArchitectedSGPRs)
-      NumSystemSGPRs += 1;
+    NumSystemSGPRs += 1;
 
     return ArgInfo.WorkGroupIDX.getRegister();
   }
 
-  Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
-    Register Reg =
-        HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
-    unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
-    ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
-    if (!HasArchitectedSGPRs)
-      NumSystemSGPRs += 1;
+  Register addWorkGroupIDY() {
+    Register Reg = getNextSystemSGPR();
+    ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg);
+    NumSystemSGPRs += 1;
 
     return ArgInfo.WorkGroupIDY.getRegister();
   }
 
-  Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
-    Register Reg =
-        HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
-    unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
-    ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
-    if (!HasArchitectedSGPRs)
-      NumSystemSGPRs += 1;
+  Register addWorkGroupIDZ() {
+    Register Reg = getNextSystemSGPR();
+    ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg);
+    NumSystemSGPRs += 1;
 
     return ArgInfo.WorkGroupIDZ.getRegister();
   }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index 77976e470fc789..5462764facfe69 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -147,7 +147,7 @@ define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset_nuw(<4 x i32> inreg
 ; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
 ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4
 ; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
-; GISEL-DAG: %[[ADD:.*]]:sreg_32 = nsw S_ADD_I32 %1, %10, implicit-def dead $scc
+; GISEL-DAG: %[[ADD:.*]]:sreg_32 = nsw S_ADD_I32 %1, %13, implicit-def dead $scc
 ; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[ADD]], 0,
 define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset_nsw(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) #0 {
     %off = add nsw i32 %i, 77
@@ -171,7 +171,7 @@ define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset_nsw(<4 x i32> inreg
 ; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
 ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4
 ; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
-; GISEL-DAG: %[[ADD:.*]]:sreg_32 = S_ADD_I32 %1, %10, implicit-def dead $scc
+; GISEL-DAG: %[[ADD:.*]]:sreg_32 = S_ADD_I32 %1, %13, implicit-def dead $scc
 ; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[ADD]], 0,
 define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset_noflags(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) #0 {
     %off = add i32 %i, 77
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
index e9d9b669408ac5..f389d54702aa03 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
@@ -16,8 +16,8 @@ define amdgpu_ps i32 @test_ps() #1 {
 }
 
 ; GCN-LABEL: {{^}}test_cs:
-; GCN: s_mov_b64 s[4:5], s[0:1]
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:4
+; GCN: s_mov_b64 s[8:9], s[0:1]
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[8:11], 0 offset:4
 ; GCN: s_load_dword s0, s[0:1], 0x0
 define amdgpu_cs i32 @test_cs() #1 {
   %alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
new file mode 100644
index 00000000000000..34003849fd7cde
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s
+
+define amdgpu_kernel void @workgroup_ids_kernel() {
+; GFX9-SDAG-LABEL: workgroup_ids_kernel:
+; GFX9-SDAG:       ; %bb.0: ; %.entry
+; GFX9-SDAG-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: workgroup_ids_kernel:
+; GFX9-GISEL:       ; %bb.0: ; %.entry
+; GFX9-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX9-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-GISEL-NEXT:    s_endpgm
+.entry:
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+  %idz = call i32 @llvm.amdgcn.workgroup.id.z()
+  %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0
+  %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1
+  %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @caller() {
+; GFX9-SDAG-LABEL: caller:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-SDAG-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-SDAG-NEXT:    s_mov_b32 s38, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-SDAG-NEXT:    s_add_u32 s36, s36, s8
+; GFX9-SDAG-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-SDAG-NEXT:    s_add_u32 s8, s4, 36
+; GFX9-SDAG-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-SDAG-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-SDAG-NEXT:    s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX9-SDAG-NEXT:    s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[14:15], s[4:5], 0x0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-SDAG-NEXT:    s_mov_b32 s12, ttmp9
+; GFX9-SDAG-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-SDAG-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_swappc_b64 s[30:31], s[14:15]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: caller:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-GISEL-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-GISEL-NEXT:    s_mov_b32 s38, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-GISEL-NEXT:    s_add_u32 s36, s36, s8
+; GFX9-GISEL-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s8, s4, 36
+; GFX9-GISEL-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX9-GISEL-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-GISEL-NEXT:    s_mov_b32 s12, ttmp9
+; GFX9-GISEL-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], s[14:15]
+; GFX9-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-GISEL-NEXT:    s_endpgm
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  call void @callee(i32 %idx) #0
+  ret void
+}
+
+declare void @callee(i32) #0
+
+define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
+; GFX9-SDAG-LABEL: workgroup_ids_device_func:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, ttmp9
+; GFX9-SDAG-NEXT:    s_and_b32 s4, ttmp7, 0xffff
+; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v6, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    s_lshr_b32 s4, ttmp7, 16
+; GFX9-SDAG-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: workgroup_ids_device_func:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_and_b32 s6, ttmp7, 0xffff
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, ttmp9
+; GFX9-GISEL-NEXT:    s_lshr_b32 s5, ttmp7, 16
+; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v6, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-GISEL-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+  %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+  %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %id.x, ptr addrspace(1) %outx
+  store volatile i32 %id.y, ptr addrspace(1) %outy
+  store volatile i32 %id.z, ptr addrspace(1) %outz
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg)
+
+attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
similarity index 65%
rename from llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
rename to llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index c732ff70942550..2065dcd9779447 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -75,6 +75,47 @@ define amdgpu_cs void @caller() {
 
 declare amdgpu_gfx void @callee(i32)
 
+define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
+; GFX9-SDAG-LABEL: workgroup_ids_gfx:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, ttmp9
+; GFX9-SDAG-NEXT:    s_and_b32 s34, ttmp7, 0xffff
+; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v6, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s34
+; GFX9-SDAG-NEXT:    s_lshr_b32 s34, ttmp7, 16
+; GFX9-SDAG-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s34
+; GFX9-SDAG-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: workgroup_ids_gfx:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_and_b32 s36, ttmp7, 0xffff
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, ttmp9
+; GFX9-GISEL-NEXT:    s_lshr_b32 s35, ttmp7, 16
+; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v6, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s36
+; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s35
+; GFX9-GISEL-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+  %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+  %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %id.x, ptr addrspace(1) %outx
+  store volatile i32 %id.y, ptr addrspace(1) %outy
+  store volatile i32 %id.z, ptr addrspace(1) %outz
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workgroup.id.x()
 declare i32 @llvm.amdgcn.workgroup.id.y()
 declare i32 @llvm.amdgcn.workgroup.id.z()
diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index 2e9f09ad41813d..7c3604c2a6299d 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -34,12 +34,13 @@ define amdgpu_cs void @test_simple_indirect_call() {
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s8, s8, s0
+; GFX9-NEXT:    s_add_u32 s8, s8, s3
 ; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[8:9]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[10:11]
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: test_simple_indirect_call:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_getpc_b64 s[8:9]
@@ -49,7 +50,7 @@ define amdgpu_cs void @test_simple_indirect_call() {
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bitset0_b32 s11, 21
-; GFX10-NEXT:    s_add_u32 s8, s8, s0
+; GFX10-NEXT:    s_add_u32 s8, s8, s3
 ; GFX10-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-NEXT:    s_mov_b64 s[0:1], s[8:9]
 ; GFX10-NEXT:    s_mov_b64 s[2:3], s[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 272daac3e03c29..59a0d8242cb295 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -142,36 +142,36 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 }
 
 ; GCN-LABEL: {{^}}cs_main:
-; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
 ; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
 
-; GFX10-FLATSCR: s_add_u32 s0, s0, s2
+; GFX10-FLATSCR: s_add_u32 s0, s0, s5
 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 
-; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
-; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
-; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[4:5]
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s4, s0
+; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
-; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
-; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-PAL-DAG: s_and_b32 s5, s5, 0xffff
+; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s4, s3
+; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s5, 0
 
-; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
-; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
-; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; GFX10-FLATSCR-PAL: s_getpc_b64 s[4:5]
+; GFX10-FLATSCR-PAL: s_mov_b32 s4, s0
+; GFX10-FLATSCR-PAL: s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
-; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
-; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
-; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
-; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-FLATSCR-PAL: s_and_b32 s5, s5, 0xffff
+; GFX10-FLATSCR-PAL: s_add_u32 s4, s4, s3
+; GFX10-FLATSCR-PAL: s_addc_u32 s5, s5, 0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
 
-; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; MUBUF-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
 
 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD
 



More information about the llvm-commits mailing list