[llvm] [AMDGPU][CodeGen] enable D16Writes32BitVgpr for gfx12 (PR #165587)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 31 12:00:25 PDT 2025


https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/165587

>From a4d3236dd7b572e44ff02a6f9da98d4bc840d067 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 28 Oct 2025 14:35:17 -0400
Subject: [PATCH] [AMDGPU] Enable D16Writes32BitVgpr (D16 HW fix) for gfx12

---
 llvm/lib/Target/AMDGPU/AMDGPU.td     |   3 +-
 llvm/test/CodeGen/AMDGPU/spillv16.ll | 590 +++++++++++++++++++++------
 2 files changed, 465 insertions(+), 128 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 54d94b1f8682e..84121138a40df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1614,7 +1614,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
    FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
    FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32,
-   FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureD16Writes32BitVgpr
   ]
 >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 0e45df223465d..986fb186cd001 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -1,23 +1,59 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,-d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,+d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W32
 
 define void @spill_i16_alu() {
-; GCN-TRUE16-LABEL: spill_i16_alu:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: spill_i16_alu:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-FAKE16-LABEL: spill_i16_alu:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x7b, v0
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: spill_i16_alu:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-FAKE16-LABEL: spill_i16_alu:
 ; GCN-FAKE16:       ; %bb.0: ; %entry
 ; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49,26 +85,95 @@ entry:
 }
 
 define void @spill_i16_alu_two_vals() {
-; GCN-TRUE16-LABEL: spill_i16_alu_two_vals:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
-; GCN-TRUE16-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:4 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: spill_i16_alu_two_vals:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX11-TRUE16-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:4 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: spill_i16_alu_two_vals:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x7b, v0
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:4 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x7b, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v1, s32 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 offset:4 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-D16W16-LABEL: spill_i16_alu_two_vals:
+; GFX12-TRUE16-D16W16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-D16W16-NEXT:    scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GFX12-TRUE16-D16W16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-D16W16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-D16W16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-D16W16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-D16W32-LABEL: spill_i16_alu_two_vals:
+; GFX12-TRUE16-D16W32:       ; %bb.0: ; %entry
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-D16W32-NEXT:    scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GFX12-TRUE16-D16W32-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-D16W32-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-D16W32-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-FAKE16-LABEL: spill_i16_alu_two_vals:
 ; GCN-FAKE16:       ; %bb.0: ; %entry
 ; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -113,20 +218,52 @@ entry:
 ; Tests after this do not actually test 16 bit spills because there is no use of VGPR_16. They could demonstrate 16 bit spills if we update the instructions to use VGPR_16 instead of VGPR_32
 
 define void @spill_i16() {
-; GCN-TRUE16-LABEL: spill_i16:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: spill_i16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-FAKE16-LABEL: spill_i16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: spill_i16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-FAKE16-LABEL: spill_i16:
 ; GCN-FAKE16:       ; %bb.0: ; %entry
 ; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -156,20 +293,52 @@ entry:
 }
 
 define void @spill_half() {
-; GCN-TRUE16-LABEL: spill_half:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: spill_half:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: spill_half:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_half:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-FAKE16-LABEL: spill_half:
 ; GCN-FAKE16:       ; %bb.0: ; %entry
 ; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -199,20 +368,52 @@ entry:
 }
 
 define void @spill_i16_from_v2i16() {
-; GCN-TRUE16-LABEL: spill_i16_from_v2i16:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: spill_i16_from_v2i16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-FAKE16-LABEL: spill_i16_from_v2i16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: spill_i16_from_v2i16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-FAKE16-LABEL: spill_i16_from_v2i16:
 ; GCN-FAKE16:       ; %bb.0: ; %entry
 ; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -242,27 +443,73 @@ entry:
 }
 
 define void @spill_2xi16_from_v2i16() {
-; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: spill_2xi16_from_v2i16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: spill_2xi16_from_v2i16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:12 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16:
 ; GCN-FAKE16:       ; %bb.0: ; %entry
 ; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -303,27 +550,70 @@ entry:
 }
 
 define void @spill_2xi16_from_v2i16_one_free_reg() {
-; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v7, off, s32 offset:2 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v7, s32 offset:2 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
 ; GCN-FAKE16:       ; %bb.0: ; %entry
 ; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -362,19 +652,65 @@ entry:
 }
 
 define void @spill_v2i16() {
-; GCN-LABEL: spill_v2i16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    ;;#ASMSTART
-; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
-; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: spill_v2i16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    ;;#ASMSTART
+; GFX11-TRUE16-NEXT:    ;;#ASMEND
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: spill_v2i16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    ;;#ASMSTART
+; GFX11-FAKE16-NEXT:    ;;#ASMEND
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: spill_v2i16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GCN-FAKE16-LABEL: spill_v2i16:
+; GCN-FAKE16:       ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT:    ;;#ASMSTART
+; GCN-FAKE16-NEXT:    ;;#ASMEND
+; GCN-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
+; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
 



More information about the llvm-commits mailing list