[llvm] [AMDGPU] Move architected SGPR implementation into isel (PR #79120)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 23 09:06:23 PST 2024
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/79120
>From cdd6b18ff534917f2b1c2e795128dd410371990f Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 22 Jan 2024 16:05:29 +0000
Subject: [PATCH 1/5] Precommit tests
---
.../lower-work-group-id-intrinsics-hsa.ll | 277 ++++++++++++++++++
.../lower-work-group-id-intrinsics-pal.ll | 188 ++++++++++++
.../AMDGPU/lower-work-group-id-intrinsics.ll | 128 --------
3 files changed, 465 insertions(+), 128 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
new file mode 100644
index 000000000000000..af1c601ee972ad3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -0,0 +1,277 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+
+define amdgpu_kernel void @workgroup_ids_kernel() {
+; GFX9-LABEL: workgroup_ids_kernel:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel:
+; GFX9ARCH-SDAG: ; %bb.0: ; %.entry
+; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9ARCH-SDAG-NEXT: s_endpgm
+;
+; GFX9ARCH-GISEL-LABEL: workgroup_ids_kernel:
+; GFX9ARCH-GISEL: ; %bb.0: ; %.entry
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9
+; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff
+; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9ARCH-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: workgroup_ids_kernel:
+; GFX12-SDAG: ; %bb.0: ; %.entry
+; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: workgroup_ids_kernel:
+; GFX12-GISEL: ; %bb.0: ; %.entry
+; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9
+; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff
+; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+.entry:
+ %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+ %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+ %idz = call i32 @llvm.amdgcn.workgroup.id.z()
+ %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0
+ %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1
+ %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_kernel void @caller() {
+; GFX9-SDAG-LABEL: caller:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-SDAG-NEXT: s_mov_b32 s38, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s7
+; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36
+; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3]
+; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4
+; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0
+; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-SDAG-NEXT: s_mov_b32 s12, s6
+; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: caller:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-GISEL-NEXT: s_mov_b32 s38, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s7
+; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36
+; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-GISEL-NEXT: s_mov_b32 s12, s6
+; GFX9-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX9ARCH-SDAG-LABEL: caller:
+; GFX9ARCH-SDAG: ; %bb.0:
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6
+; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0
+; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36
+; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0
+; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3]
+; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4
+; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12
+; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s12, ttmp9
+; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9ARCH-SDAG-NEXT: s_endpgm
+;
+; GFX9ARCH-GISEL-LABEL: caller:
+; GFX9ARCH-GISEL: ; %bb.0:
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6
+; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36
+; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1]
+; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4
+; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
+; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s12, ttmp9
+; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9ARCH-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: caller:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
+; GFX12-SDAG-NEXT: s_mov_b32 s12, ttmp9
+; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi
+; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo
+; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: caller:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
+; GFX12-GISEL-NEXT: s_mov_b32 s12, ttmp9
+; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo
+; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi
+; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX12-GISEL-NEXT: s_endpgm
+ %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void @callee(i32 %idx) #0
+ ret void
+}
+
+declare void @callee(i32) #0
+
+define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
+; GFX9-LABEL: workgroup_ids_device_func:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, s12
+; GFX9-NEXT: global_store_dword v[0:1], v6, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s13
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9ARCH-LABEL: workgroup_ids_device_func:
+; GFX9ARCH: ; %bb.0:
+; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9ARCH-NEXT: v_mov_b32_e32 v6, s12
+; GFX9ARCH-NEXT: global_store_dword v[0:1], v6, off
+; GFX9ARCH-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-NEXT: v_mov_b32_e32 v0, s13
+; GFX9ARCH-NEXT: global_store_dword v[2:3], v0, off
+; GFX9ARCH-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-NEXT: v_mov_b32_e32 v0, s14
+; GFX9ARCH-NEXT: global_store_dword v[4:5], v0, off
+; GFX9ARCH-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: workgroup_ids_device_func:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13
+; GFX12-NEXT: v_mov_b32_e32 v8, s14
+; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+ %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+ %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %id.x, ptr addrspace(1) %outx
+ store volatile i32 %id.y, ptr addrspace(1) %outy
+ store volatile i32 %id.z, ptr addrspace(1) %outz
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg)
+
+attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
new file mode 100644
index 000000000000000..473b85459d3d31b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+
+define amdgpu_cs void @_amdgpu_cs_main() {
+; GFX9-LABEL: _amdgpu_cs_main:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main:
+; GFX9ARCH-SDAG: ; %bb.0: ; %.entry
+; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9ARCH-SDAG-NEXT: s_endpgm
+;
+; GFX9ARCH-GISEL-LABEL: _amdgpu_cs_main:
+; GFX9ARCH-GISEL: ; %bb.0: ; %.entry
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9
+; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff
+; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9ARCH-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: _amdgpu_cs_main:
+; GFX12-SDAG: ; %bb.0: ; %.entry
+; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: _amdgpu_cs_main:
+; GFX12-GISEL: ; %bb.0: ; %.entry
+; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9
+; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff
+; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+.entry:
+ %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+ %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+ %idz = call i32 @llvm.amdgcn.workgroup.id.z()
+ %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0
+ %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1
+ %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_cs void @caller() {
+; GFX9-LABEL: caller:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; GFX9ARCH-SDAG-LABEL: caller:
+; GFX9ARCH-SDAG: ; %bb.0:
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s10, -1
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, s0
+; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, 0
+; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[0:1]
+; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4
+; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
+; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9]
+; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9ARCH-SDAG-NEXT: s_endpgm
+;
+; GFX9ARCH-GISEL-LABEL: caller:
+; GFX9ARCH-GISEL: ; %bb.0:
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s10, -1
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, s0
+; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, 0
+; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1]
+; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4
+; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
+; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9]
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9ARCH-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: caller:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: caller:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT: s_endpgm
+ %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+ call amdgpu_gfx void @callee(i32 %idx)
+ ret void
+}
+
+declare amdgpu_gfx void @callee(i32)
+
+define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
+; GFX9-LABEL: workgroup_ids_gfx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9ARCH-LABEL: workgroup_ids_gfx:
+; GFX9ARCH: ; %bb.0:
+; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9ARCH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: workgroup_ids_gfx:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+ %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+ %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %id.x, ptr addrspace(1) %outx
+ store volatile i32 %id.y, ptr addrspace(1) %outy
+ store volatile i32 %id.z, ptr addrspace(1) %outz
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9-GISEL: {{.*}}
+; GFX9-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
deleted file mode 100644
index 495b54758de0493..000000000000000
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
+++ /dev/null
@@ -1,128 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
-
-define amdgpu_cs void @_amdgpu_cs_main() {
-; GFX9-SDAG-LABEL: _amdgpu_cs_main:
-; GFX9-SDAG: ; %bb.0: ; %.entry
-; GFX9-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
-; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
-; GFX9-SDAG-NEXT: s_endpgm
-;
-; GFX9-GISEL-LABEL: _amdgpu_cs_main:
-; GFX9-GISEL: ; %bb.0: ; %.entry
-; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9
-; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff
-; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
-; GFX9-GISEL-NEXT: s_endpgm
-;
-; GFX12-SDAG-LABEL: _amdgpu_cs_main:
-; GFX12-SDAG: ; %bb.0: ; %.entry
-; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
-; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: _amdgpu_cs_main:
-; GFX12-GISEL: ; %bb.0: ; %.entry
-; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9
-; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff
-; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-NEXT: s_endpgm
-.entry:
- %idx = call i32 @llvm.amdgcn.workgroup.id.x()
- %idy = call i32 @llvm.amdgcn.workgroup.id.y()
- %idz = call i32 @llvm.amdgcn.workgroup.id.z()
- %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0
- %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1
- %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2
- call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
- ret void
-}
-
-define amdgpu_cs void @caller() {
-; GFX9-SDAG-LABEL: caller:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9]
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s0
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
-; GFX9-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi
-; GFX9-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
-; GFX9-SDAG-NEXT: s_mov_b32 s32, 0
-; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_add_u32 s8, s8, s0
-; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9]
-; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11]
-; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-SDAG-NEXT: s_endpgm
-;
-; GFX9-GISEL-LABEL: caller:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9]
-; GFX9-GISEL-NEXT: s_mov_b32 s8, s0
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo
-; GFX9-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
-; GFX9-GISEL-NEXT: s_mov_b32 s32, 0
-; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s8, s8, s0
-; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9]
-; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
-; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-GISEL-NEXT: s_endpgm
-;
-; GFX12-SDAG-LABEL: caller:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
-; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi
-; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo
-; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
-; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: caller:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
-; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
-; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
-; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
-; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
- %idx = call i32 @llvm.amdgcn.workgroup.id.x()
- call amdgpu_gfx void @callee(i32 %idx)
- ret void
-}
-
-declare amdgpu_gfx void @callee(i32)
-
-declare i32 @llvm.amdgcn.workgroup.id.x()
-declare i32 @llvm.amdgcn.workgroup.id.y()
-declare i32 @llvm.amdgcn.workgroup.id.z()
-declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12: {{.*}}
-; GFX9: {{.*}}
>From c7d065554d04fb102e168b8f6b2ccb1c5f0f29b8 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 23 Jan 2024 10:18:03 +0000
Subject: [PATCH 2/5] Implement architected SGPR support directly in
legalization/isel.
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 36 ++++-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 36 ++++-
.../lower-work-group-id-intrinsics-hsa.ll | 66 ++++++---
.../lower-work-group-id-intrinsics-pal.ll | 13 +-
.../AMDGPU/workgroup-id-in-arch-sgprs.ll | 131 +++++++-----------
5 files changed, 170 insertions(+), 112 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8e74d4c0e94592b..b88d7534f3e26ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4178,10 +4178,42 @@ bool AMDGPULegalizerInfo::loadInputValue(
Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
- const ArgDescriptor *Arg;
+ const ArgDescriptor *Arg = nullptr;
const TargetRegisterClass *ArgRC;
LLT ArgTy;
- std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
+
+ const ArgDescriptor WorkGroupIDX =
+ ArgDescriptor::createRegister(AMDGPU::TTMP9);
+ // TODO: No need to mask GridY if GridZ is not valid.
+ const ArgDescriptor WorkGroupIDY =
+ ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu);
+ const ArgDescriptor WorkGroupIDZ =
+ ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+ if (ST.hasArchitectedSGPRs() &&
+ AMDGPU::isCompute(B.getMF().getFunction().getCallingConv())) {
+ switch (ArgType) {
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
+ Arg = &WorkGroupIDX;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
+ Arg = &WorkGroupIDY;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
+ Arg = &WorkGroupIDZ;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Arg)
+ std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
if (!Arg) {
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 073c8cc72117375..2cc0fc1f54ddc7e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2063,11 +2063,43 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
const SIMachineFunctionInfo &MFI,
EVT VT,
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
- const ArgDescriptor *Reg;
+ const ArgDescriptor *Reg = nullptr;
const TargetRegisterClass *RC;
LLT Ty;
- std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
+ const ArgDescriptor WorkGroupIDX =
+ ArgDescriptor::createRegister(AMDGPU::TTMP9);
+ // TODO: No need to mask GridY if GridZ is not valid.
+ const ArgDescriptor WorkGroupIDY =
+ ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu);
+ const ArgDescriptor WorkGroupIDZ =
+ ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+ if (Subtarget->hasArchitectedSGPRs() &&
+ AMDGPU::isCompute(
+ DAG.getMachineFunction().getFunction().getCallingConv())) {
+ switch (PVID) {
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
+ Reg = &WorkGroupIDX;
+ RC = &AMDGPU::SReg_32RegClass;
+ Ty = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
+ Reg = &WorkGroupIDY;
+ RC = &AMDGPU::SReg_32RegClass;
+ Ty = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
+ Reg = &WorkGroupIDZ;
+ RC = &AMDGPU::SReg_32RegClass;
+ Ty = LLT::scalar(32);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Reg)
+ std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
if (!Reg) {
if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
// It's possible for a kernarg intrinsic call to appear in a kernel with
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index af1c601ee972ad3..063cba73886b40c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -17,11 +17,11 @@ define amdgpu_kernel void @workgroup_ids_kernel() {
;
; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel:
; GFX9ARCH-SDAG: ; %bb.0: ; %.entry
-; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16
; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GFX9ARCH-SDAG-NEXT: s_endpgm
;
@@ -38,11 +38,10 @@ define amdgpu_kernel void @workgroup_ids_kernel() {
;
; GFX12-SDAG-LABEL: workgroup_ids_kernel:
; GFX12-SDAG: ; %bb.0: ; %.entry
-; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
-; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff
+; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1
; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -230,19 +229,37 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9ARCH-LABEL: workgroup_ids_device_func:
-; GFX9ARCH: ; %bb.0:
-; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9ARCH-NEXT: v_mov_b32_e32 v6, s12
-; GFX9ARCH-NEXT: global_store_dword v[0:1], v6, off
-; GFX9ARCH-NEXT: s_waitcnt vmcnt(0)
-; GFX9ARCH-NEXT: v_mov_b32_e32 v0, s13
-; GFX9ARCH-NEXT: global_store_dword v[2:3], v0, off
-; GFX9ARCH-NEXT: s_waitcnt vmcnt(0)
-; GFX9ARCH-NEXT: v_mov_b32_e32 v0, s14
-; GFX9ARCH-NEXT: global_store_dword v[4:5], v0, off
-; GFX9ARCH-NEXT: s_waitcnt vmcnt(0)
-; GFX9ARCH-NEXT: s_setpc_b64 s[30:31]
+; GFX9ARCH-SDAG-LABEL: workgroup_ids_device_func:
+; GFX9ARCH-SDAG: ; %bb.0:
+; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v6, ttmp9
+; GFX9ARCH-SDAG-NEXT: s_and_b32 s4, ttmp7, 0xffff
+; GFX9ARCH-SDAG-NEXT: global_store_dword v[0:1], v6, off
+; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s4, ttmp7, 16
+; GFX9ARCH-SDAG-NEXT: global_store_dword v[2:3], v0, off
+; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9ARCH-SDAG-NEXT: global_store_dword v[4:5], v0, off
+; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9ARCH-GISEL-LABEL: workgroup_ids_device_func:
+; GFX9ARCH-GISEL: ; %bb.0:
+; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v6, ttmp9
+; GFX9ARCH-GISEL-NEXT: s_and_b32 s4, ttmp7, 0xffff
+; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s5, ttmp7, 16
+; GFX9ARCH-GISEL-NEXT: global_store_dword v[0:1], v6, off
+; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9ARCH-GISEL-NEXT: global_store_dword v[2:3], v0, off
+; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX9ARCH-GISEL-NEXT: global_store_dword v[4:5], v0, off
+; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9ARCH-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: workgroup_ids_device_func:
; GFX12: ; %bb.0:
@@ -251,8 +268,11 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13
-; GFX12-NEXT: v_mov_b32_e32 v8, s14
+; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
+; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
+; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS
@@ -275,3 +295,5 @@ declare i32 @llvm.amdgcn.workgroup.id.z()
declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg)
attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9ARCH: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 473b85459d3d31b..cfff0a969da9e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -14,11 +14,11 @@ define amdgpu_cs void @_amdgpu_cs_main() {
;
; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main:
; GFX9ARCH-SDAG: ; %bb.0: ; %.entry
-; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
+; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16
; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GFX9ARCH-SDAG-NEXT: s_endpgm
;
@@ -35,11 +35,10 @@ define amdgpu_cs void @_amdgpu_cs_main() {
;
; GFX12-SDAG-LABEL: _amdgpu_cs_main:
; GFX12-SDAG: ; %bb.0: ; %.entry
-; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16
-; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff
+; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1
; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index 769e6b0964abdb2..c2ce6e169dc539a 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -5,43 +5,25 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
-; GFX9-SDAG-LABEL: workgroup_id_x:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9
-; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX9-GISEL-LABEL: workgroup_id_x:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9-GISEL-NEXT: s_endpgm
-;
-; GFX12-SDAG-LABEL: workgroup_id_x:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX9-LABEL: workgroup_id_x:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: workgroup_id_x:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX12-LABEL: workgroup_id_x:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
store i32 %idx, ptr addrspace(1) %ptrx
@@ -52,23 +34,25 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace
; GFX9-LABEL: workgroup_id_xy:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9
+; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_and_b32 s4, ttmp7, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v2, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: workgroup_id_xy:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9
-; GFX12-NEXT: v_mov_b32_e32 v2, ttmp7
+; GFX12-NEXT: s_and_b32 s4, ttmp7, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v2, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -81,37 +65,21 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace
}
define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) {
-; GFX9-SDAG-LABEL: workgroup_id_xyz:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9
-; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX9-SDAG-NEXT: s_endpgm
-;
-; GFX9-GISEL-LABEL: workgroup_id_xyz:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16
-; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
-; GFX9-GISEL-NEXT: s_endpgm
+; GFX9-LABEL: workgroup_id_xyz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: workgroup_id_xyz:
; GFX12: ; %bb.0:
@@ -119,15 +87,15 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9
+; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x2
-; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX12-NEXT: global_store_b32 v0, v2, s[6:7]
-; GFX12-NEXT: global_store_b32 v0, v3, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: global_store_b32 v1, v2, s[6:7]
+; GFX12-NEXT: global_store_b32 v1, v3, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -144,3 +112,8 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
declare i32 @llvm.amdgcn.workgroup.id.x()
declare i32 @llvm.amdgcn.workgroup.id.y()
declare i32 @llvm.amdgcn.workgroup.id.z()
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX12-GISEL: {{.*}}
+; GFX12-SDAG: {{.*}}
+; GFX9-GISEL: {{.*}}
+; GFX9-SDAG: {{.*}}
>From 13a22d1173e07366480bfd4e7dc020d1e64ce52d Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 23 Jan 2024 10:57:43 +0000
Subject: [PATCH 3/5] Remove architected SGPR support from argument handling
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 ++++++++---------
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 32 ++++++-------------
.../AMDGPU/indirect-call-known-callees.ll | 1 -
.../lower-work-group-id-intrinsics-hsa.ll | 4 ---
4 files changed, 22 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2cc0fc1f54ddc7e..2d7fd51b135bed2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2528,28 +2528,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
}
}
- if (Info.hasWorkGroupIDX()) {
- Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
- if (!HasArchitectedSGPRs)
+ if (!HasArchitectedSGPRs) {
+ if (Info.hasWorkGroupIDX()) {
+ Register Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupIDY()) {
- Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
- if (!HasArchitectedSGPRs)
+ if (Info.hasWorkGroupIDY()) {
+ Register Reg = Info.addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupIDZ()) {
- Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
- if (!HasArchitectedSGPRs)
+ if (Info.hasWorkGroupIDZ()) {
+ Register Reg = Info.addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
-
- CCInfo.AllocateReg(Reg);
+ CCInfo.AllocateReg(Reg);
+ }
}
if (Info.hasWorkGroupInfo()) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ecc31fbd9dd3d8b..71513e3ea98e4f6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -744,35 +744,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
}
// Add system SGPRs.
- Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
- Register Reg =
- HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR();
- ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
- if (!HasArchitectedSGPRs)
- NumSystemSGPRs += 1;
-
+ Register addWorkGroupIDX() {
+ ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
+ NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDX.getRegister();
}
- Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
- Register Reg =
- HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
- unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
- ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
- if (!HasArchitectedSGPRs)
- NumSystemSGPRs += 1;
-
+ Register addWorkGroupIDY() {
+ ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
+ NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDY.getRegister();
}
- Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
- Register Reg =
- HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
- unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
- ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
- if (!HasArchitectedSGPRs)
- NumSystemSGPRs += 1;
-
+ Register addWorkGroupIDZ() {
+ ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
+ NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDZ.getRegister();
}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index 380a13ed16128f8..47110d94918879d 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -55,7 +55,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
; GFX12-NEXT: v_mov_b32_e32 v31, v0
-; GFX12-NEXT: s_mov_b32 s12, ttmp9
; GFX12-NEXT: s_mov_b64 s[8:9], 0
; GFX12-NEXT: s_mov_b32 s32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index 063cba73886b40c..afa914c8375f64a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -145,7 +145,6 @@ define amdgpu_kernel void @caller() {
; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9ARCH-SDAG-NEXT: s_mov_b32 s12, ttmp9
; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
@@ -173,7 +172,6 @@ define amdgpu_kernel void @caller() {
; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9ARCH-GISEL-NEXT: s_mov_b32 s12, ttmp9
; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -185,7 +183,6 @@ define amdgpu_kernel void @caller() {
; GFX12-SDAG-LABEL: caller:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
-; GFX12-SDAG-NEXT: s_mov_b32 s12, ttmp9
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi
; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo
@@ -198,7 +195,6 @@ define amdgpu_kernel void @caller() {
; GFX12-GISEL-LABEL: caller:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
-; GFX12-GISEL-NEXT: s_mov_b32 s12, ttmp9
; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo
; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi
>From b5cc9f2111308d59744c873aec13e6aa0069248d Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 23 Jan 2024 15:00:47 +0000
Subject: [PATCH 4/5] Optimize for workgroup ID Y in entry functions
---
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 9 +++++----
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++-----
.../AMDGPU/workgroup-id-in-arch-sgprs.ll | 18 ++++++++----------
3 files changed, 18 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b88d7534f3e26ab..9c7da02d1cad88d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4182,15 +4182,16 @@ bool AMDGPULegalizerInfo::loadInputValue(
const TargetRegisterClass *ArgRC;
LLT ArgTy;
+ CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
const ArgDescriptor WorkGroupIDX =
ArgDescriptor::createRegister(AMDGPU::TTMP9);
// TODO: No need to mask GridY if GridZ is not valid.
- const ArgDescriptor WorkGroupIDY =
- ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu);
+ const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
+ AMDGPU::TTMP7,
+ AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
- if (ST.hasArchitectedSGPRs() &&
- AMDGPU::isCompute(B.getMF().getFunction().getCallingConv())) {
+ if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
switch (ArgType) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Arg = &WorkGroupIDX;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2d7fd51b135bed2..a86c49ca45a8825 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2067,16 +2067,16 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
const TargetRegisterClass *RC;
LLT Ty;
+ CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
const ArgDescriptor WorkGroupIDX =
ArgDescriptor::createRegister(AMDGPU::TTMP9);
// TODO: No need to mask GridY if GridZ is not valid.
- const ArgDescriptor WorkGroupIDY =
- ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFFu);
+ const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
+ AMDGPU::TTMP7,
+ AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
- if (Subtarget->hasArchitectedSGPRs() &&
- AMDGPU::isCompute(
- DAG.getMachineFunction().getFunction().getCallingConv())) {
+ if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
switch (PVID) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Reg = &WorkGroupIDX;
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index c2ce6e169dc539a..40e4692a18ec79f 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -35,24 +35,22 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_and_b32 s4, ttmp7, 0xffff
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_store_dword v1, v2, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: workgroup_id_xy:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-NEXT: s_and_b32 s4, ttmp7, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: global_store_b32 v1, v2, s[2:3]
+; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
>From 4e93c15e3eb29495edf5ede0fecf433632130b64 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 23 Jan 2024 17:06:09 +0000
Subject: [PATCH 5/5] Update TODO comments
---
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +++-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9c7da02d1cad88d..fc02766a4b27ad9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4185,7 +4185,9 @@ bool AMDGPULegalizerInfo::loadInputValue(
CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
const ArgDescriptor WorkGroupIDX =
ArgDescriptor::createRegister(AMDGPU::TTMP9);
- // TODO: No need to mask GridY if GridZ is not valid.
+ // If GridZ is not programmed in an entry function then the hardware will set
+ // it to all zeros, so there is no need to mask the GridY value in the low
+ // order bits.
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
AMDGPU::TTMP7,
AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a86c49ca45a8825..b81ec6629d83464 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2070,7 +2070,9 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
const ArgDescriptor WorkGroupIDX =
ArgDescriptor::createRegister(AMDGPU::TTMP9);
- // TODO: No need to mask GridY if GridZ is not valid.
+ // If GridZ is not programmed in an entry function then the hardware will set
+ // it to all zeros, so there is no need to mask the GridY value in the low
+ // order bits.
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
AMDGPU::TTMP7,
AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
More information about the llvm-commits
mailing list